/*
 * Copyright © 2022 Imagination Technologies Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <vulkan/vulkan.h>

#include "hwdef/rogue_hw_defs.h"
#include "hwdef/rogue_hw_utils.h"
#include "pvr_bo.h"
#include "pvr_csb.h"
#include "pvr_csb_enum_helpers.h"
#include "pvr_debug.h"
#include "pvr_job_common.h"
#include "pvr_job_context.h"
#include "pvr_job_render.h"
#include "pvr_pds.h"
#include "pvr_private.h"
#include "pvr_rogue_fw.h"
#include "pvr_types.h"
#include "pvr_winsys.h"
#include "util/compiler.h"
#include "util/macros.h"
#include "util/u_math.h"
#include "vk_alloc.h"
#include "vk_log.h"
#include "vk_util.h"

#define ROGUE_BIF_PM_FREELIST_BASE_ADDR_ALIGNSIZE 16U

/* FIXME: Is there a hardware define we can use instead? */
/* 1 DWord per PM physical page stored in the free list */
#define ROGUE_FREE_LIST_ENTRY_SIZE ((uint32_t)sizeof(uint32_t))

/* FIXME: The three defines below, for the number of PC, PD and PT entries in a
 * 4KB page, come from rgxmmudefs_km.h (meaning they're part of the
 * auto-generated hwdefs). Should these be defined in rogue_mmu.xml? Keeping in
 * mind that we probably only need these three values.
 */
#define ROGUE_NUM_PC_ENTRIES_PER_PAGE 0x400U

#define ROGUE_NUM_PD_ENTRIES_PER_PAGE 0x200U

#define ROGUE_NUM_PT_ENTRIES_PER_PAGE 0x200U

struct pvr_free_list {
   struct pvr_device *device;

   uint64_t size;

   struct pvr_bo *bo;

   struct pvr_winsys_free_list *ws_free_list;
};

/* Macrotile information. */
struct pvr_rt_mtile_info {
   uint32_t tile_size_x;
   uint32_t tile_size_y;

   uint32_t num_tiles_x;
   uint32_t num_tiles_y;

   uint32_t tiles_per_mtile_x;
   uint32_t tiles_per_mtile_y;

   uint32_t x_tile_max;
   uint32_t y_tile_max;

   uint32_t mtiles_x;
   uint32_t mtiles_y;

   uint32_t mtile_x1;
   uint32_t mtile_y1;
   uint32_t mtile_x2;
   uint32_t mtile_y2;
   uint32_t mtile_x3;
   uint32_t mtile_y3;

   uint32_t mtile_stride;
};
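
/* A render target dataset groups the GPU allocations (vheap/RTC, tail
 * pointer cache, macrotile array, MList and region headers) backing renders
 * to a given framebuffer. The per-RT-data addresses are replicated
 * ROGUE_NUM_RTDATAS times, presumably so the firmware can alternate between
 * RT datas across successive renders.
 */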
struct pvr_rt_dataset {
   struct pvr_device *device;

   /* RT dataset information */
   uint32_t width;
   uint32_t height;
   uint32_t samples;
   uint32_t layers;

   struct pvr_free_list *global_free_list;
   struct pvr_free_list *local_free_list;

   struct pvr_bo *vheap_rtc_bo;
   pvr_dev_addr_t vheap_dev_addr;
   pvr_dev_addr_t rtc_dev_addr;

   struct pvr_bo *tpc_bo;
   uint64_t tpc_stride;
   uint64_t tpc_size;

   struct pvr_winsys_rt_dataset *ws_rt_dataset;

   /* RT data information */
   struct pvr_bo *mta_mlist_bo;

   struct pvr_bo *rgn_headers_bo;
   uint64_t rgn_headers_stride;

   bool need_frag;

   uint8_t rt_data_idx;

   struct {
      pvr_dev_addr_t mta_dev_addr;
      pvr_dev_addr_t mlist_dev_addr;
      pvr_dev_addr_t rgn_headers_dev_addr;
   } rt_datas[ROGUE_NUM_RTDATAS];
};

VkResult pvr_free_list_create(struct pvr_device *device,
                              uint32_t initial_size,
                              uint32_t max_size,
                              uint32_t grow_size,
                              uint32_t grow_threshold,
                              struct pvr_free_list *parent_free_list,
                              struct pvr_free_list **const free_list_out)
{
   struct pvr_winsys_free_list *parent_ws_free_list =
      parent_free_list ? parent_free_list->ws_free_list : NULL;
   const uint64_t bo_flags = PVR_BO_ALLOC_FLAG_GPU_UNCACHED |
                             PVR_BO_ALLOC_FLAG_PM_FW_PROTECT;
   struct pvr_free_list *free_list;
   uint32_t cache_line_size;
   uint32_t initial_num_pages;
   uint32_t grow_num_pages;
   uint32_t max_num_pages;
   uint64_t addr_alignment;
   uint64_t size_alignment;
   uint64_t size;
   VkResult result;

   assert((initial_size + grow_size) <= max_size);
   assert(max_size != 0);
   assert(grow_threshold <= 100);
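
   /* Note: grow_threshold appears to be expressed as a percentage (0..100),
    * presumably the freelist occupancy at which the firmware grows the list
    * by grow_size; hence the assert above.
    */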

   /* Make sure the free list is created with at least a single page. */
   if (initial_size == 0)
      initial_size = ROGUE_BIF_PM_PHYSICAL_PAGE_SIZE;

   /* The freelist sizes must respect the PM freelist base address alignment
    * requirement. As the freelist entries are cached by the SLC, it's also
    * necessary to ensure the sizes respect the SLC cache line size to avoid
    * invalid entries appearing in the cache, which would be problematic after
    * a grow operation, as the SLC entries aren't invalidated. We do this by
    * making sure the freelist values are appropriately aligned.
    *
    * To calculate the alignment, we first take the larger of the freelist
    * base address alignment and the SLC cache line size. We then divide this
    * by the freelist entry size to determine the number of freelist entries
    * required by the PM. Finally, as each entry holds a single PM physical
    * page, we multiply the number of entries by the page size.
    *
    * As an example, if the base address alignment is 16 bytes, the SLC cache
    * line size is 64 bytes and the freelist entry size is 4 bytes then 16
    * entries are required, as we take the SLC cacheline size (being the larger
    * of the two values) and divide this by 4. If the PM page size is 4096
    * bytes then we end up with an alignment of 65536 bytes.
    */
   cache_line_size = rogue_get_slc_cache_line_size(&device->pdevice->dev_info);

   addr_alignment =
      MAX2(ROGUE_BIF_PM_FREELIST_BASE_ADDR_ALIGNSIZE, cache_line_size);
   size_alignment = (addr_alignment / ROGUE_FREE_LIST_ENTRY_SIZE) *
                    ROGUE_BIF_PM_PHYSICAL_PAGE_SIZE;
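
   /* Worked example from the comment above, assuming a 64-byte SLC cache
    * line: addr_alignment = MAX2(16, 64) = 64, so size_alignment =
    * (64 / 4) * 4096 = 65536 bytes.
    */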

   assert(util_is_power_of_two_nonzero(size_alignment));

   initial_size = align64(initial_size, size_alignment);
   max_size = align64(max_size, size_alignment);
   grow_size = align64(grow_size, size_alignment);

   /* Make sure the 'max' size doesn't exceed what the firmware supports and
    * adjust the other sizes accordingly.
    */
   if (max_size > ROGUE_FREE_LIST_MAX_SIZE) {
      max_size = ROGUE_FREE_LIST_MAX_SIZE;
      assert(align64(max_size, size_alignment) == max_size);
   }

   if (initial_size > max_size)
      initial_size = max_size;

   if (initial_size == max_size)
      grow_size = 0;

   initial_num_pages = initial_size >> ROGUE_BIF_PM_PHYSICAL_PAGE_SHIFT;
   max_num_pages = max_size >> ROGUE_BIF_PM_PHYSICAL_PAGE_SHIFT;
   grow_num_pages = grow_size >> ROGUE_BIF_PM_PHYSICAL_PAGE_SHIFT;

   /* Calculate the size of the buffer needed to store the free list entries
    * based on the maximum number of pages we can have.
    */
   size = max_num_pages * ROGUE_FREE_LIST_ENTRY_SIZE;
   assert(align64(size, addr_alignment) == size);

   free_list = vk_alloc(&device->vk.alloc,
                        sizeof(*free_list),
                        8,
                        VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
   if (!free_list)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   /* FIXME: The memory is mapped GPU uncached, but this seems to contradict
    * the comment above about aligning to the SLC cache line size.
    */
   result = pvr_bo_alloc(device,
                         device->heaps.general_heap,
                         size,
                         addr_alignment,
                         bo_flags,
                         &free_list->bo);
   if (result != VK_SUCCESS)
      goto err_vk_free_free_list;

   result = device->ws->ops->free_list_create(device->ws,
                                              free_list->bo->vma,
                                              initial_num_pages,
                                              max_num_pages,
                                              grow_num_pages,
                                              grow_threshold,
                                              parent_ws_free_list,
                                              &free_list->ws_free_list);
   if (result != VK_SUCCESS)
      goto err_pvr_bo_free_bo;

   free_list->device = device;
   free_list->size = size;

   *free_list_out = free_list;

   return VK_SUCCESS;

err_pvr_bo_free_bo:
   pvr_bo_free(device, free_list->bo);

err_vk_free_free_list:
   vk_free(&device->vk.alloc, free_list);

   return result;
}
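
/* Usage sketch: pvr_render_target_dataset_create() below creates each RT
 * dataset's "local" freelist this way, with initial_size == max_size ==
 * rogue_get_min_free_list_size() and growth disabled, parented to the
 * device's global freelist.
 */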

void pvr_free_list_destroy(struct pvr_free_list *free_list)
{
   struct pvr_device *device = free_list->device;

   device->ws->ops->free_list_destroy(free_list->ws_free_list);
   pvr_bo_free(device, free_list->bo);
   vk_free(&device->vk.alloc, free_list);
}
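
/* Splits a sample count into its X/Y sample grid arrangement, e.g. 4 samples
 * map to a 2x2 grid and 8 samples to 2x4.
 */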
static inline void pvr_get_samples_in_xy(uint32_t samples,
                                         uint32_t *const x_out,
                                         uint32_t *const y_out)
{
   switch (samples) {
   case 1:
      *x_out = 1;
      *y_out = 1;
      break;
   case 2:
      *x_out = 1;
      *y_out = 2;
      break;
   case 4:
      *x_out = 2;
      *y_out = 2;
      break;
   case 8:
      *x_out = 2;
      *y_out = 4;
      break;
   default:
      unreachable("Unsupported number of samples");
   }
}

static void pvr_rt_mtile_info_init(struct pvr_device *device,
                                   struct pvr_rt_mtile_info *info,
                                   uint32_t width,
                                   uint32_t height,
                                   uint32_t samples)
{
   const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
   uint32_t samples_in_x;
   uint32_t samples_in_y;

   pvr_get_samples_in_xy(samples, &samples_in_x, &samples_in_y);

   info->tile_size_x = PVR_GET_FEATURE_VALUE(dev_info, tile_size_x, 1);
   info->tile_size_y = PVR_GET_FEATURE_VALUE(dev_info, tile_size_y, 1);

   info->num_tiles_x = DIV_ROUND_UP(width, info->tile_size_x);
   info->num_tiles_y = DIV_ROUND_UP(height, info->tile_size_y);

   rogue_get_num_macrotiles_xy(dev_info, &info->mtiles_x, &info->mtiles_y);

   if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) {
      assert(PVR_GET_FEATURE_VALUE(dev_info,
                                   simple_parameter_format_version,
                                   0) == 2);
      /* Set up 16 macrotiles with a multiple of 2x2 tiles per macrotile,
       * which is aligned to a tile group.
       */
      info->mtile_x1 = DIV_ROUND_UP(info->num_tiles_x, 8) * 2;
      info->mtile_y1 = DIV_ROUND_UP(info->num_tiles_y, 8) * 2;
      info->mtile_x2 = 0;
      info->mtile_y2 = 0;
      info->mtile_x3 = 0;
      info->mtile_y3 = 0;
      info->x_tile_max = ALIGN_POT(info->num_tiles_x, 2) - 1;
      info->y_tile_max = ALIGN_POT(info->num_tiles_y, 2) - 1;
   } else {
      /* Set up 16 macrotiles with a multiple of 4x4 tiles per macrotile. */
      info->mtile_x1 = ALIGN_POT(DIV_ROUND_UP(info->num_tiles_x, 4), 4);
      info->mtile_y1 = ALIGN_POT(DIV_ROUND_UP(info->num_tiles_y, 4), 4);
      info->mtile_x2 = info->mtile_x1 * 2;
      info->mtile_y2 = info->mtile_y1 * 2;
      info->mtile_x3 = info->mtile_x1 * 3;
      info->mtile_y3 = info->mtile_y1 * 3;
      info->x_tile_max = info->num_tiles_x - 1;
      info->y_tile_max = info->num_tiles_y - 1;
   }

   info->tiles_per_mtile_x = info->mtile_x1 * samples_in_x;
   info->tiles_per_mtile_y = info->mtile_y1 * samples_in_y;

   info->mtile_stride = info->mtile_x1 * info->mtile_y1;
}

/* Note that the unit of the return value depends on the GPU. For cores with
 * the simple_internal_parameter_format feature the returned size is
 * interpreted as the number of region headers. For cores without this feature
 * it's interpreted as the size in dwords.
 */
static uint64_t
pvr_rt_get_isp_region_size(struct pvr_device *device,
                           const struct pvr_rt_mtile_info *mtile_info)
{
   const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
   uint64_t rgn_size =
      mtile_info->tiles_per_mtile_x * mtile_info->tiles_per_mtile_y;

   if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) {
      uint32_t version;

      rgn_size *= mtile_info->mtiles_x * mtile_info->mtiles_y;

      if (PVR_FEATURE_VALUE(dev_info,
                            simple_parameter_format_version,
                            &version)) {
         version = 0;
      }

      if (version == 2) {
         /* One region header per 2x2 tile group. */
         rgn_size /= (2U * 2U);
      }
   } else {
      const uint64_t rgn_header_size = rogue_get_region_header_size(dev_info);

      /* Round up to the next dword to prevent IPF overrun and convert from
       * bytes to dwords.
       */
      rgn_size = DIV_ROUND_UP(rgn_size * rgn_header_size, 4);
   }

   return rgn_size;
}

static VkResult pvr_rt_vheap_rtc_data_init(struct pvr_device *device,
                                           struct pvr_rt_dataset *rt_dataset,
                                           uint32_t layers)
{
   const uint64_t bo_flags = PVR_BO_ALLOC_FLAG_GPU_UNCACHED |
                             PVR_BO_ALLOC_FLAG_ZERO_ON_ALLOC;
   uint64_t vheap_size;
   uint32_t alignment;
   uint64_t rtc_size;
   VkResult result;

   vheap_size = ROGUE_CR_PM_VHEAP_TABLE_SIZE * ROGUE_PM_VHEAP_ENTRY_SIZE;

   if (layers > 1) {
      uint64_t rtc_entries;

      vheap_size = ALIGN_POT(vheap_size, PVRX(CR_TA_RTC_ADDR_BASE_ALIGNMENT));

      rtc_entries = ROGUE_NUM_TEAC + ROGUE_NUM_TE + ROGUE_NUM_VCE;
      if (PVR_HAS_QUIRK(&device->pdevice->dev_info, 48545))
         rtc_entries += ROGUE_NUM_TE;

      rtc_size = rtc_entries * ROGUE_RTC_SIZE_IN_BYTES;
   } else {
      rtc_size = 0;
   }

   alignment = MAX2(PVRX(CR_PM_VHEAP_TABLE_BASE_ADDR_ALIGNMENT),
                    PVRX(CR_TA_RTC_ADDR_BASE_ALIGNMENT));

   result = pvr_bo_alloc(device,
                         device->heaps.general_heap,
                         vheap_size + rtc_size,
                         alignment,
                         bo_flags,
                         &rt_dataset->vheap_rtc_bo);
   if (result != VK_SUCCESS)
      return result;

   rt_dataset->vheap_dev_addr = rt_dataset->vheap_rtc_bo->vma->dev_addr;

   if (rtc_size > 0) {
      rt_dataset->rtc_dev_addr =
         PVR_DEV_ADDR_OFFSET(rt_dataset->vheap_dev_addr, vheap_size);
   } else {
      rt_dataset->rtc_dev_addr = PVR_DEV_ADDR_INVALID;
   }

   return VK_SUCCESS;
}

static void pvr_rt_vheap_rtc_data_fini(struct pvr_rt_dataset *rt_dataset)
{
   rt_dataset->rtc_dev_addr = PVR_DEV_ADDR_INVALID;

   pvr_bo_free(rt_dataset->device, rt_dataset->vheap_rtc_bo);
   rt_dataset->vheap_rtc_bo = NULL;
}

static void
pvr_rt_get_tail_ptr_stride_size(const struct pvr_device *device,
                                const struct pvr_rt_mtile_info *mtile_info,
                                uint32_t layers,
                                uint64_t *const stride_out,
                                uint64_t *const size_out)
{
   uint32_t max_num_mtiles;
   uint32_t num_mtiles_x;
   uint32_t num_mtiles_y;
   uint32_t version;
   uint64_t size;

   num_mtiles_x = mtile_info->mtiles_x * mtile_info->tiles_per_mtile_x;
   num_mtiles_y = mtile_info->mtiles_y * mtile_info->tiles_per_mtile_y;

   max_num_mtiles = MAX2(util_next_power_of_two64(num_mtiles_x),
                         util_next_power_of_two64(num_mtiles_y));

   size = max_num_mtiles * max_num_mtiles;
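
   /* Note: the tail pointer area is sized for a square, power-of-two grid
    * covering the larger dimension; this looks like an addressing requirement
    * of the TE's tail pointer cache rather than a tight bound.
    */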

   if (PVR_FEATURE_VALUE(&device->pdevice->dev_info,
                         simple_parameter_format_version,
                         &version)) {
      version = 0;
   }

   if (version == 2) {
      /* One tail pointer cache entry per 2x2 tile group. */
      size /= (2U * 2U);
   }

   size *= ROGUE_TAIL_POINTER_SIZE;

   if (layers > 1) {
      size = ALIGN_POT(size, ROGUE_BIF_PM_PHYSICAL_PAGE_SIZE);

      *stride_out = size / ROGUE_BIF_PM_PHYSICAL_PAGE_SIZE;
      *size_out = size * layers;
   } else {
      *stride_out = 0;
      *size_out = size;
   }
}

static VkResult pvr_rt_tpc_data_init(struct pvr_device *device,
                                     struct pvr_rt_dataset *rt_dataset,
                                     const struct pvr_rt_mtile_info *mtile_info,
                                     uint32_t layers)
{
   const uint64_t bo_flags = PVR_BO_ALLOC_FLAG_GPU_UNCACHED |
                             PVR_BO_ALLOC_FLAG_ZERO_ON_ALLOC;
   uint64_t tpc_size;

   pvr_rt_get_tail_ptr_stride_size(device,
                                   mtile_info,
                                   layers,
                                   &rt_dataset->tpc_stride,
                                   &rt_dataset->tpc_size);
   tpc_size = ALIGN_POT(rt_dataset->tpc_size, ROGUE_TE_TPC_CACHE_LINE_SIZE);

   return pvr_bo_alloc(device,
                       device->heaps.general_heap,
                       tpc_size,
                       PVRX(CR_TE_TPC_ADDR_BASE_ALIGNMENT),
                       bo_flags,
                       &rt_dataset->tpc_bo);
}

static void pvr_rt_tpc_data_fini(struct pvr_rt_dataset *rt_dataset)
{
   pvr_bo_free(rt_dataset->device, rt_dataset->tpc_bo);
   rt_dataset->tpc_bo = NULL;
}

static uint32_t
pvr_rt_get_mlist_size(const struct pvr_free_list *global_free_list,
                      const struct pvr_free_list *local_free_list)
{
   uint32_t num_pte_pages;
   uint32_t num_pde_pages;
   uint32_t num_pce_pages;
   uint64_t total_pages;
   uint32_t mlist_size;

   assert(global_free_list->size + local_free_list->size <=
          ROGUE_PM_MAX_PB_VIRT_ADDR_SPACE);

   total_pages = (global_free_list->size + local_free_list->size) >>
                 ROGUE_BIF_PM_PHYSICAL_PAGE_SHIFT;

   /* Calculate the total number of physical pages required to hold the page
    * table, directory and catalog entries for the freelist pages.
    */
   num_pte_pages = DIV_ROUND_UP(total_pages, ROGUE_NUM_PT_ENTRIES_PER_PAGE);
   num_pde_pages = DIV_ROUND_UP(num_pte_pages, ROGUE_NUM_PD_ENTRIES_PER_PAGE);
   num_pce_pages = DIV_ROUND_UP(num_pde_pages, ROGUE_NUM_PC_ENTRIES_PER_PAGE);

   /* Calculate the MList size considering the total number of pages in the PB
    * are shared among all the PM address spaces.
    */
   mlist_size = (num_pce_pages + num_pde_pages + num_pte_pages) *
                ROGUE_NUM_PM_ADDRESS_SPACES * ROGUE_MLIST_ENTRY_STRIDE;

   return ALIGN_POT(mlist_size, ROGUE_BIF_PM_PHYSICAL_PAGE_SIZE);
}

static void pvr_rt_get_region_headers_stride_size(
   const struct pvr_device *device,
   const struct pvr_rt_mtile_info *mtile_info,
   uint32_t layers,
   uint64_t *const stride_out,
   uint64_t *const size_out)
{
   const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
   const uint32_t rgn_header_size = rogue_get_region_header_size(dev_info);
   uint32_t rgn_headers_size;
   uint32_t num_tiles_x;
   uint32_t num_tiles_y;
   uint32_t group_size;
   uint32_t version;

   if (PVR_FEATURE_VALUE(dev_info, simple_parameter_format_version, &version))
      version = 0;

   group_size = version == 2 ? 2 : 1;
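
   /* With simple parameter format version 2, one region header covers a 2x2
    * tile group (see pvr_rt_get_isp_region_size()), hence the divide by
    * group_size on each axis below.
    */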

   num_tiles_x = mtile_info->mtiles_x * mtile_info->tiles_per_mtile_x;
   num_tiles_y = mtile_info->mtiles_y * mtile_info->tiles_per_mtile_y;

   rgn_headers_size =
      (num_tiles_x / group_size) * (num_tiles_y / group_size) * rgn_header_size;

   if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) {
      rgn_headers_size =
         ALIGN_POT(rgn_headers_size, PVRX(CR_TE_PSGREGION_ADDR_BASE_ALIGNMENT));
   }

   if (layers > 1) {
      rgn_headers_size =
         ALIGN_POT(rgn_headers_size, PVRX(CR_TE_PSG_REGION_STRIDE_UNIT_SIZE));
   }

   *stride_out = rgn_header_size;
   *size_out = rgn_headers_size * layers;
}

static VkResult
pvr_rt_mta_mlist_data_init(struct pvr_device *device,
                           struct pvr_rt_dataset *rt_dataset,
                           const struct pvr_free_list *global_free_list,
                           const struct pvr_free_list *local_free_list,
                           const struct pvr_rt_mtile_info *mtile_info)
{
   const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
   const uint32_t mlist_size =
      pvr_rt_get_mlist_size(global_free_list, local_free_list);
   uint32_t mta_size = rogue_get_macrotile_array_size(dev_info);
   const uint32_t num_rt_datas = ARRAY_SIZE(rt_dataset->rt_datas);
   uint32_t rt_datas_mlist_size;
   uint32_t rt_datas_mta_size;
   pvr_dev_addr_t dev_addr;
   VkResult result;

   /* Allocate memory for the macrotile array and MList for all RT datas.
    *
    * Allocation layout: MTA[0..N] + MList alignment padding + MList[0..N].
    *
    * N is the number of RT datas.
    */
   rt_datas_mta_size = ALIGN_POT(mta_size * num_rt_datas,
                                 PVRX(CR_PM_MLIST0_BASE_ADDR_ALIGNMENT));
   rt_datas_mlist_size = mlist_size * num_rt_datas;

   result = pvr_bo_alloc(device,
                         device->heaps.general_heap,
                         rt_datas_mta_size + rt_datas_mlist_size,
                         PVRX(CR_PM_MTILE_ARRAY_BASE_ADDR_ALIGNMENT),
                         PVR_BO_ALLOC_FLAG_GPU_UNCACHED,
                         &rt_dataset->mta_mlist_bo);
   if (result != VK_SUCCESS)
      return result;

   dev_addr = rt_dataset->mta_mlist_bo->vma->dev_addr;

   for (uint32_t i = 0; i < num_rt_datas; i++) {
      if (mta_size != 0) {
         rt_dataset->rt_datas[i].mta_dev_addr = dev_addr;
         dev_addr = PVR_DEV_ADDR_OFFSET(dev_addr, mta_size);
      } else {
         rt_dataset->rt_datas[i].mta_dev_addr = PVR_DEV_ADDR_INVALID;
      }
   }

   dev_addr = PVR_DEV_ADDR_OFFSET(rt_dataset->mta_mlist_bo->vma->dev_addr,
                                  rt_datas_mta_size);

   for (uint32_t i = 0; i < num_rt_datas; i++) {
      if (mlist_size != 0) {
         rt_dataset->rt_datas[i].mlist_dev_addr = dev_addr;
         dev_addr = PVR_DEV_ADDR_OFFSET(dev_addr, mlist_size);
      } else {
         rt_dataset->rt_datas[i].mlist_dev_addr = PVR_DEV_ADDR_INVALID;
      }
   }

   return VK_SUCCESS;
}

static void pvr_rt_mta_mlist_data_fini(struct pvr_rt_dataset *rt_dataset)
{
   for (uint32_t i = 0; i < ARRAY_SIZE(rt_dataset->rt_datas); i++) {
      rt_dataset->rt_datas[i].mlist_dev_addr = PVR_DEV_ADDR_INVALID;
      rt_dataset->rt_datas[i].mta_dev_addr = PVR_DEV_ADDR_INVALID;
   }

   pvr_bo_free(rt_dataset->device, rt_dataset->mta_mlist_bo);
   rt_dataset->mta_mlist_bo = NULL;
}

static VkResult
pvr_rt_rgn_headers_data_init(struct pvr_device *device,
                             struct pvr_rt_dataset *rt_dataset,
                             const struct pvr_rt_mtile_info *mtile_info,
                             uint32_t layers)
{
   const uint32_t num_rt_datas = ARRAY_SIZE(rt_dataset->rt_datas);
   uint64_t rgn_headers_size;
   pvr_dev_addr_t dev_addr;
   VkResult result;

   pvr_rt_get_region_headers_stride_size(device,
                                         mtile_info,
                                         layers,
                                         &rt_dataset->rgn_headers_stride,
                                         &rgn_headers_size);

   result = pvr_bo_alloc(device,
                         device->heaps.rgn_hdr_heap,
                         rgn_headers_size * num_rt_datas,
                         PVRX(CR_TE_PSGREGION_ADDR_BASE_ALIGNMENT),
                         PVR_BO_ALLOC_FLAG_GPU_UNCACHED,
                         &rt_dataset->rgn_headers_bo);
   if (result != VK_SUCCESS)
      return result;

   dev_addr = rt_dataset->rgn_headers_bo->vma->dev_addr;

   for (uint32_t i = 0; i < num_rt_datas; i++) {
      rt_dataset->rt_datas[i].rgn_headers_dev_addr = dev_addr;
      dev_addr = PVR_DEV_ADDR_OFFSET(dev_addr, rgn_headers_size);
   }

   return VK_SUCCESS;
}

static void pvr_rt_rgn_headers_data_fini(struct pvr_rt_dataset *rt_dataset)
{
   for (uint32_t i = 0; i < ARRAY_SIZE(rt_dataset->rt_datas); i++)
      rt_dataset->rt_datas[i].rgn_headers_dev_addr = PVR_DEV_ADDR_INVALID;

   pvr_bo_free(rt_dataset->device, rt_dataset->rgn_headers_bo);
   rt_dataset->rgn_headers_bo = NULL;
}

static VkResult pvr_rt_datas_init(struct pvr_device *device,
                                  struct pvr_rt_dataset *rt_dataset,
                                  const struct pvr_free_list *global_free_list,
                                  const struct pvr_free_list *local_free_list,
                                  const struct pvr_rt_mtile_info *mtile_info,
                                  uint32_t layers)
{
   VkResult result;

   result = pvr_rt_mta_mlist_data_init(device,
                                       rt_dataset,
                                       global_free_list,
                                       local_free_list,
                                       mtile_info);
   if (result != VK_SUCCESS)
      return result;

   result =
      pvr_rt_rgn_headers_data_init(device, rt_dataset, mtile_info, layers);
   if (result != VK_SUCCESS)
      goto err_pvr_rt_mta_mlist_data_fini;

   return VK_SUCCESS;

err_pvr_rt_mta_mlist_data_fini:
   pvr_rt_mta_mlist_data_fini(rt_dataset);

   return result;
}

static void pvr_rt_datas_fini(struct pvr_rt_dataset *rt_dataset)
{
   pvr_rt_rgn_headers_data_fini(rt_dataset);
   pvr_rt_mta_mlist_data_fini(rt_dataset);
}
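
/* Note: the CR_ISP_MTILE_SIZE x/y values are doubled along an axis when the
 * requested sample count exceeds the ISP's native samples per pixel,
 * mirroring the sample grid layout in pvr_get_samples_in_xy().
 */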
static uint32_t
pvr_rogue_get_cr_isp_mtile_size_val(const struct pvr_device_info *dev_info,
                                    uint32_t samples,
                                    const struct pvr_rt_mtile_info *mtile_info)
{
   uint32_t samples_per_pixel =
      PVR_GET_FEATURE_VALUE(dev_info, isp_samples_per_pixel, 0);
   uint32_t isp_mtile_size;

   pvr_csb_pack (&isp_mtile_size, CR_ISP_MTILE_SIZE, value) {
      value.x = mtile_info->mtile_x1;
      value.y = mtile_info->mtile_y1;

      if (samples_per_pixel == 1) {
         if (samples >= 4)
            value.x <<= 1;

         if (samples >= 2)
            value.y <<= 1;
      } else if (samples_per_pixel == 2) {
         if (samples >= 8)
            value.x <<= 1;

         if (samples >= 4)
            value.y <<= 1;
      } else if (samples_per_pixel == 4) {
         if (samples >= 8)
            value.y <<= 1;
      } else {
         assert(!"Unsupported ISP samples per pixel value");
      }
   }

   return isp_mtile_size;
}

static uint64_t pvr_rogue_get_cr_multisamplectl_val(uint32_t samples,
                                                    bool y_flip)
{
   static const struct {
      uint8_t x[8];
      uint8_t y[8];
   } sample_positions[4] = {
      /* 1 sample */
      {
         .x = { 8 },
         .y = { 8 },
      },
      /* 2 samples */
      {
         .x = { 12, 4 },
         .y = { 12, 4 },
      },
      /* 4 samples */
      {
         .x = { 6, 14, 2, 10 },
         .y = { 2, 6, 10, 14 },
      },
      /* 8 samples */
      {
         .x = { 9, 7, 13, 5, 3, 1, 11, 15 },
         .y = { 5, 11, 9, 3, 13, 7, 15, 1 },
      },
   };
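
   /* Note: positions appear to be in 1/16th-of-a-pixel units (0..15), and the
    * y_flip path below mirrors each Y position about the pixel centre via
    * 16 - y. The switch cases intentionally fall through so that, e.g., the
    * 8-sample case also packs positions 3..0.
    */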
   uint64_t multisamplectl;
   uint8_t idx;

   idx = util_fast_log2(samples);
   assert(idx < ARRAY_SIZE(sample_positions));

   pvr_csb_pack (&multisamplectl, CR_PPP_MULTISAMPLECTL, value) {
      switch (samples) {
      case 8:
         value.msaa_x7 = sample_positions[idx].x[7];
         value.msaa_x6 = sample_positions[idx].x[6];
         value.msaa_x5 = sample_positions[idx].x[5];
         value.msaa_x4 = sample_positions[idx].x[4];

         if (y_flip) {
            value.msaa_y7 = 16U - sample_positions[idx].y[7];
            value.msaa_y6 = 16U - sample_positions[idx].y[6];
            value.msaa_y5 = 16U - sample_positions[idx].y[5];
            value.msaa_y4 = 16U - sample_positions[idx].y[4];
         } else {
            value.msaa_y7 = sample_positions[idx].y[7];
            value.msaa_y6 = sample_positions[idx].y[6];
            value.msaa_y5 = sample_positions[idx].y[5];
            value.msaa_y4 = sample_positions[idx].y[4];
         }

         FALLTHROUGH;
      case 4:
         value.msaa_x3 = sample_positions[idx].x[3];
         value.msaa_x2 = sample_positions[idx].x[2];

         if (y_flip) {
            value.msaa_y3 = 16U - sample_positions[idx].y[3];
            value.msaa_y2 = 16U - sample_positions[idx].y[2];
         } else {
            value.msaa_y3 = sample_positions[idx].y[3];
            value.msaa_y2 = sample_positions[idx].y[2];
         }

         FALLTHROUGH;
      case 2:
         value.msaa_x1 = sample_positions[idx].x[1];

         if (y_flip) {
            value.msaa_y1 = 16U - sample_positions[idx].y[1];
         } else {
            value.msaa_y1 = sample_positions[idx].y[1];
         }

         FALLTHROUGH;
      case 1:
         value.msaa_x0 = sample_positions[idx].x[0];

         if (y_flip) {
            value.msaa_y0 = 16U - sample_positions[idx].y[0];
         } else {
            value.msaa_y0 = sample_positions[idx].y[0];
         }

         break;
      default:
         unreachable("Unsupported number of samples");
      }
   }

   return multisamplectl;
}

static uint32_t
pvr_rogue_get_cr_te_aa_val(const struct pvr_device_info *dev_info,
                           uint32_t samples)
{
   uint32_t samples_per_pixel =
      PVR_GET_FEATURE_VALUE(dev_info, isp_samples_per_pixel, 0);
   uint32_t te_aa;

   pvr_csb_pack (&te_aa, CR_TE_AA, value) {
      if (samples_per_pixel == 1) {
         if (samples >= 2)
            value.y = true;
         if (samples >= 4)
            value.x = true;
      } else if (samples_per_pixel == 2) {
         if (samples >= 2)
            value.x2 = true;
         if (samples >= 4)
            value.y = true;
         if (samples >= 8)
            value.x = true;
      } else if (samples_per_pixel == 4) {
         if (samples >= 2)
            value.x2 = true;
         if (samples >= 4)
            value.y2 = true;
         if (samples >= 8)
            value.y = true;
      } else {
         assert(!"Unsupported ISP samples per pixel value");
      }
   }

   return te_aa;
}

static void pvr_rt_dataset_ws_create_info_init(
   struct pvr_rt_dataset *rt_dataset,
   const struct pvr_rt_mtile_info *mtile_info,
   struct pvr_winsys_rt_dataset_create_info *create_info)
{
   struct pvr_device *device = rt_dataset->device;
   const struct pvr_device_info *dev_info = &device->pdevice->dev_info;

   memset(create_info, 0, sizeof(*create_info));

   /* Local freelist. */
   create_info->local_free_list = rt_dataset->local_free_list->ws_free_list;

   /* ISP register values. */
   if (PVR_HAS_ERN(dev_info, 42307) &&
       !(PVR_HAS_FEATURE(dev_info, roguexe) && mtile_info->tile_size_x == 16)) {
      float value;

      if (rt_dataset->width != 0) {
         value =
            ROGUE_ISP_MERGE_LOWER_LIMIT_NUMERATOR / (float)rt_dataset->width;
         create_info->isp_merge_lower_x = fui(value);

         value =
            ROGUE_ISP_MERGE_UPPER_LIMIT_NUMERATOR / (float)rt_dataset->width;
         create_info->isp_merge_upper_x = fui(value);
      }

      if (rt_dataset->height != 0) {
         value =
            ROGUE_ISP_MERGE_LOWER_LIMIT_NUMERATOR / (float)rt_dataset->height;
         create_info->isp_merge_lower_y = fui(value);

         value =
            ROGUE_ISP_MERGE_UPPER_LIMIT_NUMERATOR / (float)rt_dataset->height;
         create_info->isp_merge_upper_y = fui(value);
      }

      value = ((float)rt_dataset->width * ROGUE_ISP_MERGE_SCALE_FACTOR) /
              (ROGUE_ISP_MERGE_UPPER_LIMIT_NUMERATOR -
               ROGUE_ISP_MERGE_LOWER_LIMIT_NUMERATOR);
      create_info->isp_merge_scale_x = fui(value);

      value = ((float)rt_dataset->height * ROGUE_ISP_MERGE_SCALE_FACTOR) /
              (ROGUE_ISP_MERGE_UPPER_LIMIT_NUMERATOR -
               ROGUE_ISP_MERGE_LOWER_LIMIT_NUMERATOR);
      create_info->isp_merge_scale_y = fui(value);
   }

   create_info->isp_mtile_size =
      pvr_rogue_get_cr_isp_mtile_size_val(dev_info,
                                          rt_dataset->samples,
                                          mtile_info);

   /* PPP register values. */
   create_info->ppp_multi_sample_ctl =
      pvr_rogue_get_cr_multisamplectl_val(rt_dataset->samples, false);
   create_info->ppp_multi_sample_ctl_y_flipped =
      pvr_rogue_get_cr_multisamplectl_val(rt_dataset->samples, true);

   pvr_csb_pack (&create_info->ppp_screen, CR_PPP_SCREEN, value) {
      value.pixxmax = rt_dataset->width - 1;
      value.pixymax = rt_dataset->height - 1;
   }

   /* TE register values. */
   create_info->te_aa =
      pvr_rogue_get_cr_te_aa_val(dev_info, rt_dataset->samples);

   pvr_csb_pack (&create_info->te_mtile1, CR_TE_MTILE1, value) {
      value.x1 = mtile_info->mtile_x1;
      if (!PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) {
         value.x2 = mtile_info->mtile_x2;
         value.x3 = mtile_info->mtile_x3;
      }
   }

   pvr_csb_pack (&create_info->te_mtile2, CR_TE_MTILE2, value) {
      value.y1 = mtile_info->mtile_y1;
      if (!PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) {
         value.y2 = mtile_info->mtile_y2;
         value.y3 = mtile_info->mtile_y3;
      }
   }

   pvr_csb_pack (&create_info->te_screen, CR_TE_SCREEN, value) {
      value.xmax = mtile_info->x_tile_max;
      value.ymax = mtile_info->y_tile_max;
   }

   /* Allocations and associated information. */
   create_info->vheap_table_dev_addr = rt_dataset->vheap_dev_addr;
   create_info->rtc_dev_addr = rt_dataset->rtc_dev_addr;

   create_info->tpc_dev_addr = rt_dataset->tpc_bo->vma->dev_addr;
   create_info->tpc_stride = rt_dataset->tpc_stride;
   create_info->tpc_size = rt_dataset->tpc_size;

   STATIC_ASSERT(ARRAY_SIZE(create_info->rt_datas) ==
                 ARRAY_SIZE(rt_dataset->rt_datas));
   for (uint32_t i = 0; i < ARRAY_SIZE(create_info->rt_datas); i++) {
      create_info->rt_datas[i].pm_mlist_dev_addr =
         rt_dataset->rt_datas[i].mlist_dev_addr;
      create_info->rt_datas[i].macrotile_array_dev_addr =
         rt_dataset->rt_datas[i].mta_dev_addr;
      create_info->rt_datas[i].rgn_header_dev_addr =
         rt_dataset->rt_datas[i].rgn_headers_dev_addr;
   }

   create_info->rgn_header_size =
      pvr_rt_get_isp_region_size(device, mtile_info);

   /* Miscellaneous. */
   create_info->mtile_stride = mtile_info->mtile_stride;
   create_info->max_rts = rt_dataset->layers;
}

VkResult
pvr_render_target_dataset_create(struct pvr_device *device,
                                 uint32_t width,
                                 uint32_t height,
                                 uint32_t samples,
                                 uint32_t layers,
                                 struct pvr_rt_dataset **const rt_dataset_out)
{
   const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
   struct pvr_winsys_rt_dataset_create_info rt_dataset_create_info;
   struct pvr_rt_mtile_info mtile_info;
   struct pvr_rt_dataset *rt_dataset;
   VkResult result;

   assert(device->global_free_list);
   assert(width <= rogue_get_render_size_max_x(dev_info));
   assert(height <= rogue_get_render_size_max_y(dev_info));
   assert(layers > 0 && layers <= PVR_MAX_FRAMEBUFFER_LAYERS);

   pvr_rt_mtile_info_init(device, &mtile_info, width, height, samples);

   rt_dataset = vk_zalloc(&device->vk.alloc,
                          sizeof(*rt_dataset),
                          8,
                          VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
   if (!rt_dataset)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   rt_dataset->device = device;
   rt_dataset->width = width;
   rt_dataset->height = height;
   rt_dataset->samples = samples;
   rt_dataset->layers = layers;
   rt_dataset->global_free_list = device->global_free_list;

   /* The maximum supported free list size is based on the assumption that this
    * freelist (the "local" freelist) is always the minimum size required by
    * the hardware. See the documentation of ROGUE_FREE_LIST_MAX_SIZE for more
    * details.
    */
   result = pvr_free_list_create(device,
                                 rogue_get_min_free_list_size(dev_info),
                                 rogue_get_min_free_list_size(dev_info),
                                 0 /* grow_size */,
                                 0 /* grow_threshold */,
                                 rt_dataset->global_free_list,
                                 &rt_dataset->local_free_list);
   if (result != VK_SUCCESS)
      goto err_vk_free_rt_dataset;

   result = pvr_rt_vheap_rtc_data_init(device, rt_dataset, layers);
   if (result != VK_SUCCESS)
      goto err_pvr_free_list_destroy;

   result = pvr_rt_tpc_data_init(device, rt_dataset, &mtile_info, layers);
   if (result != VK_SUCCESS)
      goto err_pvr_rt_vheap_rtc_data_fini;

   result = pvr_rt_datas_init(device,
                              rt_dataset,
                              rt_dataset->global_free_list,
                              rt_dataset->local_free_list,
                              &mtile_info,
                              layers);
   if (result != VK_SUCCESS)
      goto err_pvr_rt_tpc_data_fini;

   /* rt_dataset must be fully initialized by this point since
    * pvr_rt_dataset_ws_create_info_init() depends on this.
    */
   pvr_rt_dataset_ws_create_info_init(rt_dataset,
                                      &mtile_info,
                                      &rt_dataset_create_info);

   result =
      device->ws->ops->render_target_dataset_create(device->ws,
                                                    &rt_dataset_create_info,
                                                    &rt_dataset->ws_rt_dataset);
   if (result != VK_SUCCESS)
      goto err_pvr_rt_datas_fini;

   *rt_dataset_out = rt_dataset;

   return VK_SUCCESS;

err_pvr_rt_datas_fini:
   pvr_rt_datas_fini(rt_dataset);

err_pvr_rt_tpc_data_fini:
   pvr_rt_tpc_data_fini(rt_dataset);

err_pvr_rt_vheap_rtc_data_fini:
   pvr_rt_vheap_rtc_data_fini(rt_dataset);

err_pvr_free_list_destroy:
   pvr_free_list_destroy(rt_dataset->local_free_list);

err_vk_free_rt_dataset:
   vk_free(&device->vk.alloc, rt_dataset);

   return result;
}

void pvr_render_target_dataset_destroy(struct pvr_rt_dataset *rt_dataset)
{
   struct pvr_device *device = rt_dataset->device;

   device->ws->ops->render_target_dataset_destroy(rt_dataset->ws_rt_dataset);

   pvr_rt_datas_fini(rt_dataset);
   pvr_rt_tpc_data_fini(rt_dataset);
   pvr_rt_vheap_rtc_data_fini(rt_dataset);

   pvr_free_list_destroy(rt_dataset->local_free_list);

   vk_free(&device->vk.alloc, rt_dataset);
}

static void
pvr_render_job_ws_geometry_state_init(struct pvr_render_ctx *ctx,
                                      struct pvr_render_job *job,
                                      struct pvr_winsys_geometry_state *state)
{
   const struct pvr_device_info *dev_info = &ctx->device->pdevice->dev_info;

   /* FIXME: Should this just be done unconditionally? The firmware will just
    * ignore the value anyway.
    */
   if (PVR_HAS_QUIRK(dev_info, 56279)) {
      pvr_csb_pack (&state->regs.pds_ctrl, CR_PDS_CTRL, value) {
         value.max_num_vdm_tasks = rogue_get_max_num_vdm_pds_tasks(dev_info);
      }
   } else {
      state->regs.pds_ctrl = 0;
   }

   pvr_csb_pack (&state->regs.ppp_ctrl, CR_PPP_CTRL, value) {
      value.wclampen = true;
      value.fixed_point_format = 1;
   }

   pvr_csb_pack (&state->regs.te_psg, CR_TE_PSG, value) {
      value.completeonterminate = job->geometry_terminate;

      value.region_stride = job->rt_dataset->rgn_headers_stride /
                            PVRX(CR_TE_PSG_REGION_STRIDE_UNIT_SIZE);

      value.forcenewstate = PVR_HAS_QUIRK(dev_info, 52942);
   }

   /* The set up of CR_TPU must be identical to
    * pvr_render_job_ws_fragment_state_init().
    */
   pvr_csb_pack (&state->regs.tpu, CR_TPU, value) {
      value.tag_cem_4k_face_packing = true;
   }

   pvr_csb_pack (&state->regs.tpu_border_colour_table,
                 CR_TPU_BORDER_COLOUR_TABLE_VDM,
                 value) {
      value.border_colour_table_address = job->border_colour_table_addr;
   }

   pvr_csb_pack (&state->regs.vdm_ctrl_stream_base,
                 CR_VDM_CTRL_STREAM_BASE,
                 value) {
      value.addr = job->ctrl_stream_addr;
   }

   /* Set up the USC common size for the context switch resume/load program
    * (ctx->ctx_switch.programs[i].sr->pds_load_program), which was created
    * as part of the render context.
    */
   pvr_csb_pack (&state->regs.vdm_ctx_resume_task0_size,
                 VDMCTRL_PDS_STATE0,
                 value) {
      /* Calculate the size in bytes. */
      const uint16_t shared_registers_size = job->max_shared_registers * 4;

      value.usc_common_size =
         DIV_ROUND_UP(shared_registers_size,
                      PVRX(VDMCTRL_PDS_STATE0_USC_COMMON_SIZE_UNIT_SIZE));
   };

   state->flags = 0;

   if (!job->rt_dataset->need_frag)
      state->flags |= PVR_WINSYS_GEOM_FLAG_FIRST_GEOMETRY;

   if (job->geometry_terminate)
      state->flags |= PVR_WINSYS_GEOM_FLAG_LAST_GEOMETRY;

   if (job->frag_uses_atomic_ops)
      state->flags |= PVR_WINSYS_GEOM_FLAG_SINGLE_CORE;
}
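
/* Computes the ISP extent in tiles for the given dimensions and sample count;
 * the per-axis scale factors mirror the sample grid layout in
 * pvr_get_samples_in_xy().
 */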
1242 static inline void
pvr_get_isp_num_tiles_xy(const struct pvr_device_info * dev_info,uint32_t samples,uint32_t width,uint32_t height,uint32_t * const x_out,uint32_t * const y_out)1243 pvr_get_isp_num_tiles_xy(const struct pvr_device_info *dev_info,
1244                          uint32_t samples,
1245                          uint32_t width,
1246                          uint32_t height,
1247                          uint32_t *const x_out,
1248                          uint32_t *const y_out)
1249 {
1250    uint32_t tile_samples_x;
1251    uint32_t tile_samples_y;
1252    uint32_t scale_x;
1253    uint32_t scale_y;
1254 
1255    rogue_get_isp_samples_per_tile_xy(dev_info,
1256                                      samples,
1257                                      &tile_samples_x,
1258                                      &tile_samples_y);
1259 
1260    switch (samples) {
1261    case 1:
1262       scale_x = 1;
1263       scale_y = 1;
1264       break;
1265    case 2:
1266       scale_x = 1;
1267       scale_y = 2;
1268       break;
1269    case 4:
1270       scale_x = 2;
1271       scale_y = 2;
1272       break;
1273    case 8:
1274       scale_x = 2;
1275       scale_y = 4;
1276       break;
1277    default:
1278       unreachable("Unsupported number of samples");
1279    }
1280 
1281    *x_out = DIV_ROUND_UP(width * scale_x, tile_samples_x);
1282    *y_out = DIV_ROUND_UP(height * scale_y, tile_samples_y);
1283 
1284    if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) {
1285       assert(PVR_GET_FEATURE_VALUE(dev_info,
1286                                    simple_parameter_format_version,
1287                                    0U) == 2U);
1288       /* Align to a 2x2 tile block. */
1289       *x_out = ALIGN_POT(*x_out, 2);
1290       *y_out = ALIGN_POT(*y_out, 2);
1291    }
1292 }
1293 
1294 static void
pvr_render_job_ws_fragment_state_init(struct pvr_render_ctx * ctx,struct pvr_render_job * job,struct pvr_winsys_fragment_state * state)1295 pvr_render_job_ws_fragment_state_init(struct pvr_render_ctx *ctx,
1296                                       struct pvr_render_job *job,
1297                                       struct pvr_winsys_fragment_state *state)
1298 {
1299    const enum PVRX(CR_ISP_AA_MODE_TYPE)
1300       isp_aa_mode = pvr_cr_isp_aa_mode_type(job->samples);
1301    const struct pvr_device_runtime_info *dev_runtime_info =
1302       &ctx->device->pdevice->dev_runtime_info;
1303    const struct pvr_device_info *dev_info = &ctx->device->pdevice->dev_info;
1304    uint32_t isp_ctl;
1305 
1306    /* FIXME: what to do when job->run_frag is false? */
1307 
1308    /* FIXME: pass in the number of samples rather than isp_aa_mode? */
1309    pvr_setup_tiles_in_flight(dev_info,
1310                              dev_runtime_info,
1311                              isp_aa_mode,
1312                              job->pixel_output_width,
1313                              false,
1314                              job->max_tiles_in_flight,
1315                              &isp_ctl,
1316                              &state->regs.usc_pixel_output_ctrl);
1317 
1318    pvr_csb_pack (&state->regs.isp_ctl, CR_ISP_CTL, value) {
1319       value.sample_pos = true;
1320 
1321       /* FIXME: There are a number of things that cause this to be set, this
1322        * is just one of them.
1323        */
1324       value.process_empty_tiles = job->process_empty_tiles;
1325    }
1326 
1327    /* FIXME: When pvr_setup_tiles_in_flight() is refactored it might be
1328     * possible to fully pack CR_ISP_CTL above rather than having to OR in part
1329     * of the value.
1330     */
1331    state->regs.isp_ctl |= isp_ctl;
1332 
1333    pvr_csb_pack (&state->regs.isp_aa, CR_ISP_AA, value) {
1334       value.mode = isp_aa_mode;
1335    }
1336 
1337    /* The set up of CR_TPU must be identical to
1338     * pvr_render_job_ws_geometry_state_init().
1339     */
1340    pvr_csb_pack (&state->regs.tpu, CR_TPU, value) {
1341       value.tag_cem_4k_face_packing = true;
1342    }
1343 
1344    if (PVR_HAS_FEATURE(dev_info, cluster_grouping) &&
1345        PVR_HAS_FEATURE(dev_info, slc_mcu_cache_controls) &&
1346        dev_runtime_info->num_phantoms > 1 && job->frag_uses_atomic_ops) {
1347       /* Each phantom has its own MCU, so atomicity can only be guaranteed
1348        * when all work items are processed on the same phantom. This means we
1349        * need to disable all USCs other than those of the first phantom, which
1350        * has 4 clusters. Note that we only need to do this for atomic
1351        * operations in fragment shaders, since hardware prevents the TA to run
1352        * on more than one phantom anyway.
1353        */
1354       state->regs.pixel_phantom = 0xF;
1355    } else {
1356       state->regs.pixel_phantom = 0;
1357    }
1358 
1359    pvr_csb_pack (&state->regs.isp_bgobjvals, CR_ISP_BGOBJVALS, value) {
1360       value.enablebgtag = job->enable_bg_tag;
1361 
1362       value.mask = true;
1363 
1364       /* FIXME: Hard code this for now as we don't currently support any
1365        * stencil image formats.
1366        */
1367       value.stencil = 0xFF;
1368    }
1369 
1370    pvr_csb_pack (&state->regs.isp_bgobjdepth, CR_ISP_BGOBJDEPTH, value) {
1371       /* FIXME: This is suitable for the single depth format the driver
1372        * currently supports, but may need updating to handle other depth
1373        * formats.
1374        */
1375       value.value = fui(job->depth_clear_value);
1376    }
1377 
1378    /* FIXME: Some additional set up needed to support depth and stencil
1379     * load/store operations.
1380     */
1381    pvr_csb_pack (&state->regs.isp_zlsctl, CR_ISP_ZLSCTL, value) {
1382       uint32_t aligned_width =
1383          ALIGN_POT(job->depth_physical_width, ROGUE_IPF_TILE_SIZE_PIXELS);
1384       uint32_t aligned_height =
1385          ALIGN_POT(job->depth_physical_height, ROGUE_IPF_TILE_SIZE_PIXELS);
1386 
1387       pvr_get_isp_num_tiles_xy(dev_info,
1388                                job->samples,
1389                                aligned_width,
1390                                aligned_height,
1391                                &value.zlsextent_x_z,
1392                                &value.zlsextent_y_z);
1393       value.zlsextent_x_z -= 1;
1394       value.zlsextent_y_z -= 1;
1395 
1396       if (job->depth_memlayout == PVR_MEMLAYOUT_TWIDDLED) {
1397          value.loadtwiddled = true;
1398          value.storetwiddled = true;
1399       }
1400 
1401       /* FIXME: This is suitable for the single depth format the driver
1402        * currently supports, but may need updating to handle other depth
1403        * formats.
1404        */
1405       assert(job->depth_vk_format == VK_FORMAT_D32_SFLOAT);
1406       value.zloadformat = PVRX(CR_ZLOADFORMAT_TYPE_F32Z);
1407       value.zstoreformat = PVRX(CR_ZSTOREFORMAT_TYPE_F32Z);
1408    }
1409 
   if (PVR_HAS_FEATURE(dev_info, zls_subtile)) {
      pvr_csb_pack (&state->regs.isp_zls_pixels, CR_ISP_ZLS_PIXELS, value) {
         value.x = job->depth_stride - 1;
         value.y = job->depth_height - 1;
      }
   } else {
      state->regs.isp_zls_pixels = 0;
   }

   pvr_csb_pack (&state->regs.isp_zload_store_base, CR_ISP_ZLOAD_BASE, value) {
      value.addr = job->depth_addr;
   }

   pvr_csb_pack (&state->regs.isp_stencil_load_store_base,
                 CR_ISP_STENCIL_LOAD_BASE,
                 value) {
      value.addr = job->stencil_addr;

      /* FIXME: May need to set value.enable to true. */
   }

   pvr_csb_pack (&state->regs.tpu_border_colour_table,
                 CR_TPU_BORDER_COLOUR_TABLE_PDM,
                 value) {
      value.border_colour_table_address = job->border_colour_table_addr;
   }

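   /* No occlusion query buffer is associated with the job at this point, so
    * leave the occlusion query base address as zero.
    */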
   state->regs.isp_oclqry_base = 0;

   pvr_csb_pack (&state->regs.isp_dbias_base, CR_ISP_DBIAS_BASE, value) {
      value.addr = job->depth_bias_table_addr;
   }

   pvr_csb_pack (&state->regs.isp_scissor_base, CR_ISP_SCISSOR_BASE, value) {
      value.addr = job->scissor_table_addr;
   }

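   /* The size fields of CR_EVENT_PIXEL_PDS_INFO are expressed in hardware
    * allocation units, hence the DIV_ROUND_UP of each dword count by the
    * field's unit size.
    */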
   pvr_csb_pack (&state->regs.event_pixel_pds_info,
                 CR_EVENT_PIXEL_PDS_INFO,
                 value) {
      value.const_size =
         DIV_ROUND_UP(ctx->device->pixel_event_data_size_in_dwords,
                      PVRX(CR_EVENT_PIXEL_PDS_INFO_CONST_SIZE_UNIT_SIZE));
      value.temp_stride = 0;
      value.usc_sr_size =
         DIV_ROUND_UP(PVR_STATE_PBE_DWORDS,
                      PVRX(CR_EVENT_PIXEL_PDS_INFO_USC_SR_SIZE_UNIT_SIZE));
   }

   pvr_csb_pack (&state->regs.event_pixel_pds_data,
                 CR_EVENT_PIXEL_PDS_DATA,
                 value) {
      value.addr = PVR_DEV_ADDR(job->pds_pixel_event_data_offset);
   }

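   /* Copy the precomputed PBE (pixel back-end) register words supplied with
    * the job; the static asserts below check that the source and destination
    * arrays have the same shape.
    */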
   STATIC_ASSERT(ARRAY_SIZE(state->regs.pbe_word) ==
                 ARRAY_SIZE(job->pbe_reg_words));
   STATIC_ASSERT(ARRAY_SIZE(state->regs.pbe_word[0]) ==
                 ARRAY_SIZE(job->pbe_reg_words[0]));

   for (uint32_t i = 0; i < ARRAY_SIZE(job->pbe_reg_words); i++) {
      state->regs.pbe_word[i][0] = job->pbe_reg_words[i][0];
      state->regs.pbe_word[i][1] = job->pbe_reg_words[i][1];
      state->regs.pbe_word[i][2] = job->pbe_reg_words[i][2];
   }

   STATIC_ASSERT(__same_type(state->regs.pds_bgnd, job->pds_bgnd_reg_values));
   typed_memcpy(state->regs.pds_bgnd,
                job->pds_bgnd_reg_values,
                ARRAY_SIZE(state->regs.pds_bgnd));

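   /* The PDS background program for partial renders (pds_pr_bgnd) is not set
    * up by the driver yet, so zero its register values.
    */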
   memset(state->regs.pds_pr_bgnd, 0, sizeof(state->regs.pds_pr_bgnd));

   /* FIXME: Merge geometry and fragment flags into a single flags member? */
   /* FIXME: Move to its own function? */
   state->flags = 0;

   if (job->depth_addr.addr)
      state->flags |= PVR_WINSYS_FRAG_FLAG_DEPTH_BUFFER_PRESENT;

   if (job->stencil_addr.addr)
      state->flags |= PVR_WINSYS_FRAG_FLAG_STENCIL_BUFFER_PRESENT;

   if (job->disable_compute_overlap)
      state->flags |= PVR_WINSYS_FRAG_FLAG_PREVENT_CDM_OVERLAP;

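   /* Mirrors the pixel_phantom setup above: fragment shaders that use atomic
    * operations must be confined to a single core for atomicity to hold.
    */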
   if (job->frag_uses_atomic_ops)
      state->flags |= PVR_WINSYS_FRAG_FLAG_SINGLE_CORE;

   state->zls_stride = job->depth_layer_size;
   state->sls_stride = job->depth_layer_size;
}

static void pvr_render_job_ws_submit_info_init(
   struct pvr_render_ctx *ctx,
   struct pvr_render_job *job,
   const struct pvr_winsys_job_bo *bos,
   uint32_t bo_count,
   struct vk_sync **waits,
   uint32_t wait_count,
   uint32_t *stage_flags,
   struct pvr_winsys_render_submit_info *submit_info)
{
   memset(submit_info, 0, sizeof(*submit_info));

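   /* rt_data_idx selects which of the dataset's render target datas this
    * submission uses; it is advanced after each successful fragment
    * submission (see pvr_render_job_submit()).
    */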
   submit_info->rt_dataset = job->rt_dataset->ws_rt_dataset;
   submit_info->rt_data_idx = job->rt_dataset->rt_data_idx;

   submit_info->frame_num = ctx->device->global_queue_present_count;
   submit_info->job_num = ctx->device->global_queue_job_count;

   submit_info->run_frag = job->run_frag;

   submit_info->bos = bos;
   submit_info->bo_count = bo_count;

   submit_info->waits = waits;
   submit_info->wait_count = wait_count;
   submit_info->stage_flags = stage_flags;

   /* FIXME: add WSI image bos. */

   pvr_render_job_ws_geometry_state_init(ctx, job, &submit_info->geometry);
   pvr_render_job_ws_fragment_state_init(ctx, job, &submit_info->fragment);

   /* These values are expected to match. */
   assert(submit_info->geometry.regs.tpu == submit_info->fragment.regs.tpu);
}

VkResult pvr_render_job_submit(struct pvr_render_ctx *ctx,
                               struct pvr_render_job *job,
                               const struct pvr_winsys_job_bo *bos,
                               uint32_t bo_count,
                               struct vk_sync **waits,
                               uint32_t wait_count,
                               uint32_t *stage_flags,
                               struct vk_sync *signal_sync_geom,
                               struct vk_sync *signal_sync_frag)
{
   struct pvr_rt_dataset *rt_dataset = job->rt_dataset;
   struct pvr_winsys_render_submit_info submit_info;
   struct pvr_device *device = ctx->device;
   VkResult result;

   pvr_render_job_ws_submit_info_init(ctx,
                                      job,
                                      bos,
                                      bo_count,
                                      waits,
                                      wait_count,
                                      stage_flags,
                                      &submit_info);

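   /* Hand the packed submit info to the winsys backend, which queues the
    * geometry work and, if run_frag is set, the fragment work;
    * signal_sync_geom and signal_sync_frag should be signalled as the
    * respective phases complete.
    */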
   result = device->ws->ops->render_submit(ctx->ws_ctx,
                                           &submit_info,
                                           signal_sync_geom,
                                           signal_sync_frag);
   if (result != VK_SUCCESS)
      return result;

   if (job->run_frag) {
      /* Move to the next render target data now that a fragment job has been
       * successfully submitted. This allows the next geometry job to be
       * submitted and run in parallel with it.
       */
      rt_dataset->rt_data_idx =
         (rt_dataset->rt_data_idx + 1) % ARRAY_SIZE(rt_dataset->rt_datas);

      rt_dataset->need_frag = false;
   } else {
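      /* No fragment job was kicked, so record that one is still outstanding
       * for the current RT data.
       */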
      rt_dataset->need_frag = true;
   }

   return VK_SUCCESS;
}