1 /*
2 * Copyright © 2022 Imagination Technologies Ltd.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a copy
5 * of this software and associated documentation files (the "Software"), to deal
6 * in the Software without restriction, including without limitation the rights
7 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 * copies of the Software, and to permit persons to whom the Software is
9 * furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 */
23
24 #include <assert.h>
25 #include <stdbool.h>
26 #include <stdint.h>
27 #include <vulkan/vulkan.h>
28
29 #include "hwdef/rogue_hw_defs.h"
30 #include "hwdef/rogue_hw_utils.h"
31 #include "pvr_bo.h"
32 #include "pvr_csb.h"
33 #include "pvr_csb_enum_helpers.h"
34 #include "pvr_debug.h"
35 #include "pvr_job_common.h"
36 #include "pvr_job_context.h"
37 #include "pvr_job_render.h"
38 #include "pvr_pds.h"
39 #include "pvr_private.h"
40 #include "pvr_rogue_fw.h"
41 #include "pvr_types.h"
42 #include "pvr_winsys.h"
43 #include "util/compiler.h"
44 #include "util/macros.h"
45 #include "util/u_math.h"
46 #include "vk_alloc.h"
47 #include "vk_log.h"
48 #include "vk_util.h"
49
50 #define ROGUE_BIF_PM_FREELIST_BASE_ADDR_ALIGNSIZE 16U
51
52 /* FIXME: Is there a hardware define we can use instead? */
53 /* 1 DWord per PM physical page stored in the free list */
54 #define ROGUE_FREE_LIST_ENTRY_SIZE ((uint32_t)sizeof(uint32_t))
55
56 /* FIXME: The three defines below, for the number of PC, PD and PT entries in a
57 * 4KB page, come from rgxmmudefs_km.h (meaning they're part of the
58 * auto-generated hwdefs). Should these be defined in rogue_mmu.xml? Keeping in
59 * mind that we probably only need these three values. */
60 #define ROGUE_NUM_PC_ENTRIES_PER_PAGE 0x400U
61
62 #define ROGUE_NUM_PD_ENTRIES_PER_PAGE 0x200U
63
64 #define ROGUE_NUM_PT_ENTRIES_PER_PAGE 0x200U
65
66 struct pvr_free_list {
67 struct pvr_device *device;
68
69 uint64_t size;
70
71 struct pvr_bo *bo;
72
73 struct pvr_winsys_free_list *ws_free_list;
74 };
75
76 /* Macrotile information. */
77 struct pvr_rt_mtile_info {
78 uint32_t tile_size_x;
79 uint32_t tile_size_y;
80
81 uint32_t num_tiles_x;
82 uint32_t num_tiles_y;
83
84 uint32_t tiles_per_mtile_x;
85 uint32_t tiles_per_mtile_y;
86
87 uint32_t x_tile_max;
88 uint32_t y_tile_max;
89
90 uint32_t mtiles_x;
91 uint32_t mtiles_y;
92
93 uint32_t mtile_x1;
94 uint32_t mtile_y1;
95 uint32_t mtile_x2;
96 uint32_t mtile_y2;
97 uint32_t mtile_x3;
98 uint32_t mtile_y3;
99
100 uint32_t mtile_stride;
101 };
102
103 struct pvr_rt_dataset {
104 struct pvr_device *device;
105
106 /* RT dataset information */
107 uint32_t width;
108 uint32_t height;
109 uint32_t samples;
110 uint32_t layers;
111
112 struct pvr_free_list *global_free_list;
113 struct pvr_free_list *local_free_list;
114
115 struct pvr_bo *vheap_rtc_bo;
116 pvr_dev_addr_t vheap_dev_addr;
117 pvr_dev_addr_t rtc_dev_addr;
118
119 struct pvr_bo *tpc_bo;
120 uint64_t tpc_stride;
121 uint64_t tpc_size;
122
123 struct pvr_winsys_rt_dataset *ws_rt_dataset;
124
125 /* RT data information */
126 struct pvr_bo *mta_mlist_bo;
127
128 struct pvr_bo *rgn_headers_bo;
129 uint64_t rgn_headers_stride;
130
131 bool need_frag;
132
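/* Index of the RT data currently being used for new geometry work. It is
 * advanced in pvr_render_job_submit() once a fragment job has been
 * submitted, so the next render can run in parallel with the outstanding
 * fragment work.
 */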
133 uint8_t rt_data_idx;
134
135 struct {
136 pvr_dev_addr_t mta_dev_addr;
137 pvr_dev_addr_t mlist_dev_addr;
138 pvr_dev_addr_t rgn_headers_dev_addr;
139 } rt_datas[ROGUE_NUM_RTDATAS];
140 };
141
142 VkResult pvr_free_list_create(struct pvr_device *device,
143 uint32_t initial_size,
144 uint32_t max_size,
145 uint32_t grow_size,
146 uint32_t grow_threshold,
147 struct pvr_free_list *parent_free_list,
148 struct pvr_free_list **const free_list_out)
149 {
150 struct pvr_winsys_free_list *parent_ws_free_list =
151 parent_free_list ? parent_free_list->ws_free_list : NULL;
152 const uint64_t bo_flags = PVR_BO_ALLOC_FLAG_GPU_UNCACHED |
153 PVR_BO_ALLOC_FLAG_PM_FW_PROTECT;
154 struct pvr_free_list *free_list;
155 uint32_t cache_line_size;
156 uint32_t initial_num_pages;
157 uint32_t grow_num_pages;
158 uint32_t max_num_pages;
159 uint64_t addr_alignment;
160 uint64_t size_alignment;
161 uint64_t size;
162 VkResult result;
163
164 assert((initial_size + grow_size) <= max_size);
165 assert(max_size != 0);
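/* The grow threshold is expressed as a percentage; presumably it is the
 * free list occupancy at which the kernel driver triggers a grow.
 */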
166 assert(grow_threshold <= 100);
167
168 /* Make sure the free list is created with at least a single page. */
169 if (initial_size == 0)
170 initial_size = ROGUE_BIF_PM_PHYSICAL_PAGE_SIZE;
171
172 * The freelist sizes must respect the PM freelist base address alignment
173 * requirement. As the freelist entries are cached by the SLC, it's also
174 * necessary to ensure the sizes respect the SLC cache line size to avoid
175 * invalid entries appearing in the cache, which would be problematic after
176 * a grow operation, as the SLC entries aren't invalidated. We do this by
177 * making sure the freelist values are appropriately aligned.
178 *
179 * To calculate the alignment, we first take the largest of the freelist
180 * base address alignment and the SLC cache line size. We then divide this
181 * by the freelist entry size to determine the number of freelist entries
182 * required by the PM. Finally, as each entry holds a single PM physical
183 * page, we multiply the number of entries by the page size.
184 *
185 * As an example, if the base address alignment is 16 bytes, the SLC cache
186 * line size is 64 bytes and the freelist entry size is 4 bytes then 16
187 * entries are required, as we take the SLC cacheline size (being the larger
188 * of the two values) and divide this by 4. If the PM page size is 4096
189 * bytes then we end up with an alignment of 65536 bytes.
190 */
191 cache_line_size = rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
192
193 addr_alignment =
194 MAX2(ROGUE_BIF_PM_FREELIST_BASE_ADDR_ALIGNSIZE, cache_line_size);
195 size_alignment = (addr_alignment / ROGUE_FREE_LIST_ENTRY_SIZE) *
196 ROGUE_BIF_PM_PHYSICAL_PAGE_SIZE;
197
198 assert(util_is_power_of_two_nonzero(size_alignment));
199
200 initial_size = align64(initial_size, size_alignment);
201 max_size = align64(max_size, size_alignment);
202 grow_size = align64(grow_size, size_alignment);
203
204 /* Make sure the 'max' size doesn't exceed what the firmware supports and
205 * adjust the other sizes accordingly.
206 */
207 if (max_size > ROGUE_FREE_LIST_MAX_SIZE) {
208 max_size = ROGUE_FREE_LIST_MAX_SIZE;
209 assert(align64(max_size, size_alignment) == max_size);
210 }
211
212 if (initial_size > max_size)
213 initial_size = max_size;
214
215 if (initial_size == max_size)
216 grow_size = 0;
217
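/* Convert the byte sizes into PM physical page counts, which is what the
 * winsys free list interface expects.
 */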
218 initial_num_pages = initial_size >> ROGUE_BIF_PM_PHYSICAL_PAGE_SHIFT;
219 max_num_pages = max_size >> ROGUE_BIF_PM_PHYSICAL_PAGE_SHIFT;
220 grow_num_pages = grow_size >> ROGUE_BIF_PM_PHYSICAL_PAGE_SHIFT;
221
222 /* Calculate the size of the buffer needed to store the free list entries
223 * based on the maximum number of pages we can have.
224 */
225 size = max_num_pages * ROGUE_FREE_LIST_ENTRY_SIZE;
226 assert(align64(size, addr_alignment) == size);
227
228 free_list = vk_alloc(&device->vk.alloc,
229 sizeof(*free_list),
230 8,
231 VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
232 if (!free_list)
233 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
234
235 /* FIXME: The memory is mapped GPU uncached, but this seems to contradict
236 * the comment above about aligning to the SLC cache line size.
237 */
238 result = pvr_bo_alloc(device,
239 device->heaps.general_heap,
240 size,
241 addr_alignment,
242 bo_flags,
243 &free_list->bo);
244 if (result != VK_SUCCESS)
245 goto err_vk_free_free_list;
246
247 result = device->ws->ops->free_list_create(device->ws,
248 free_list->bo->vma,
249 initial_num_pages,
250 max_num_pages,
251 grow_num_pages,
252 grow_threshold,
253 parent_ws_free_list,
254 &free_list->ws_free_list);
255 if (result != VK_SUCCESS)
256 goto err_pvr_bo_free_bo;
257
258 free_list->device = device;
259 free_list->size = size;
260
261 *free_list_out = free_list;
262
263 return VK_SUCCESS;
264
265 err_pvr_bo_free_bo:
266 pvr_bo_free(device, free_list->bo);
267
268 err_vk_free_free_list:
269 vk_free(&device->vk.alloc, free_list);
270
271 return result;
272 }
273
274 void pvr_free_list_destroy(struct pvr_free_list *free_list)
275 {
276 struct pvr_device *device = free_list->device;
277
278 device->ws->ops->free_list_destroy(free_list->ws_free_list);
279 pvr_bo_free(device, free_list->bo);
280 vk_free(&device->vk.alloc, free_list);
281 }
282
283 static inline void pvr_get_samples_in_xy(uint32_t samples,
284 uint32_t *const x_out,
285 uint32_t *const y_out)
286 {
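/* Split the sample count into a per-axis pattern: 1 -> 1x1, 2 -> 1x2,
 * 4 -> 2x2 and 8 -> 2x4 samples.
 */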
287 switch (samples) {
288 case 1:
289 *x_out = 1;
290 *y_out = 1;
291 break;
292 case 2:
293 *x_out = 1;
294 *y_out = 2;
295 break;
296 case 4:
297 *x_out = 2;
298 *y_out = 2;
299 break;
300 case 8:
301 *x_out = 2;
302 *y_out = 4;
303 break;
304 default:
305 unreachable("Unsupported number of samples");
306 }
307 }
308
309 static void pvr_rt_mtile_info_init(struct pvr_device *device,
310 struct pvr_rt_mtile_info *info,
311 uint32_t width,
312 uint32_t height,
313 uint32_t samples)
314 {
315 const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
316 uint32_t samples_in_x;
317 uint32_t samples_in_y;
318
319 pvr_get_samples_in_xy(samples, &samples_in_x, &samples_in_y);
320
321 info->tile_size_x = PVR_GET_FEATURE_VALUE(dev_info, tile_size_x, 1);
322 info->tile_size_y = PVR_GET_FEATURE_VALUE(dev_info, tile_size_y, 1);
323
324 info->num_tiles_x = DIV_ROUND_UP(width, info->tile_size_x);
325 info->num_tiles_y = DIV_ROUND_UP(height, info->tile_size_y);
326
327 rogue_get_num_macrotiles_xy(dev_info, &info->mtiles_x, &info->mtiles_y);
328
329 if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) {
330 assert(PVR_GET_FEATURE_VALUE(dev_info,
331 simple_parameter_format_version,
332 0) == 2);
333 /* Set up 16 macrotiles with a multiple of 2x2 tiles per macrotile,
334 * which is aligned to a tile group.
335 */
336 info->mtile_x1 = DIV_ROUND_UP(info->num_tiles_x, 8) * 2;
337 info->mtile_y1 = DIV_ROUND_UP(info->num_tiles_y, 8) * 2;
338 info->mtile_x2 = 0;
339 info->mtile_y2 = 0;
340 info->mtile_x3 = 0;
341 info->mtile_y3 = 0;
342 info->x_tile_max = ALIGN_POT(info->num_tiles_x, 2) - 1;
343 info->y_tile_max = ALIGN_POT(info->num_tiles_y, 2) - 1;
344 } else {
345 /* Set up 16 macrotiles with a multiple of 4x4 tiles per macrotile. */
346 info->mtile_x1 = ALIGN_POT(DIV_ROUND_UP(info->num_tiles_x, 4), 4);
347 info->mtile_y1 = ALIGN_POT(DIV_ROUND_UP(info->num_tiles_y, 4), 4);
348 info->mtile_x2 = info->mtile_x1 * 2;
349 info->mtile_y2 = info->mtile_y1 * 2;
350 info->mtile_x3 = info->mtile_x1 * 3;
351 info->mtile_y3 = info->mtile_y1 * 3;
352 info->x_tile_max = info->num_tiles_x - 1;
353 info->y_tile_max = info->num_tiles_y - 1;
354 }
355
356 info->tiles_per_mtile_x = info->mtile_x1 * samples_in_x;
357 info->tiles_per_mtile_y = info->mtile_y1 * samples_in_y;
358
359 info->mtile_stride = info->mtile_x1 * info->mtile_y1;
360 }
361
362 /* Note that the unit of the return value depends on the GPU. For cores with the
363 * simple_internal_parameter_format feature the returned size is interpreted as
364 * the number of region headers. For cores without this feature it's interpreted
365 * as the size in dwords.
366 */
367 static uint64_t
368 pvr_rt_get_isp_region_size(struct pvr_device *device,
369 const struct pvr_rt_mtile_info *mtile_info)
370 {
371 const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
372 uint64_t rgn_size =
373 mtile_info->tiles_per_mtile_x * mtile_info->tiles_per_mtile_y;
374
375 if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) {
376 uint32_t version;
377
378 rgn_size *= mtile_info->mtiles_x * mtile_info->mtiles_y;
379
380 if (PVR_FEATURE_VALUE(dev_info,
381 simple_parameter_format_version,
382 &version)) {
383 version = 0;
384 }
385
386 if (version == 2) {
387 /* One region header per 2x2 tile group. */
388 rgn_size /= (2U * 2U);
389 }
390 } else {
391 const uint64_t rgn_header_size = rogue_get_region_header_size(dev_info);
392
393 /* Round up to next dword to prevent IPF overrun and convert to bytes.
394 */
395 rgn_size = DIV_ROUND_UP(rgn_size * rgn_header_size, 4);
396 }
397
398 return rgn_size;
399 }
400
401 static VkResult pvr_rt_vheap_rtc_data_init(struct pvr_device *device,
402 struct pvr_rt_dataset *rt_dataset,
403 uint32_t layers)
404 {
405 const uint64_t bo_flags = PVR_BO_ALLOC_FLAG_GPU_UNCACHED |
406 PVR_BO_ALLOC_FLAG_ZERO_ON_ALLOC;
407 uint64_t vheap_size;
408 uint32_t alignment;
409 uint64_t rtc_size;
410 VkResult result;
411
412 vheap_size = ROGUE_CR_PM_VHEAP_TABLE_SIZE * ROGUE_PM_VHEAP_ENTRY_SIZE;
413
414 if (layers > 1) {
415 uint64_t rtc_entries;
416
417 vheap_size = ALIGN_POT(vheap_size, PVRX(CR_TA_RTC_ADDR_BASE_ALIGNMENT));
418
419 rtc_entries = ROGUE_NUM_TEAC + ROGUE_NUM_TE + ROGUE_NUM_VCE;
420 if (PVR_HAS_QUIRK(&device->pdevice->dev_info, 48545))
421 rtc_entries += ROGUE_NUM_TE;
422
423 rtc_size = rtc_entries * ROGUE_RTC_SIZE_IN_BYTES;
424 } else {
425 rtc_size = 0;
426 }
427
428 alignment = MAX2(PVRX(CR_PM_VHEAP_TABLE_BASE_ADDR_ALIGNMENT),
429 PVRX(CR_TA_RTC_ADDR_BASE_ALIGNMENT));
430
431 result = pvr_bo_alloc(device,
432 device->heaps.general_heap,
433 vheap_size + rtc_size,
434 alignment,
435 bo_flags,
436 &rt_dataset->vheap_rtc_bo);
437 if (result != VK_SUCCESS)
438 return result;
439
440 rt_dataset->vheap_dev_addr = rt_dataset->vheap_rtc_bo->vma->dev_addr;
441
442 if (rtc_size > 0) {
443 rt_dataset->rtc_dev_addr =
444 PVR_DEV_ADDR_OFFSET(rt_dataset->vheap_dev_addr, vheap_size);
445 } else {
446 rt_dataset->rtc_dev_addr = PVR_DEV_ADDR_INVALID;
447 }
448
449 return VK_SUCCESS;
450 }
451
452 static void pvr_rt_vheap_rtc_data_fini(struct pvr_rt_dataset *rt_dataset)
453 {
454 rt_dataset->rtc_dev_addr = PVR_DEV_ADDR_INVALID;
455
456 pvr_bo_free(rt_dataset->device, rt_dataset->vheap_rtc_bo);
457 rt_dataset->vheap_rtc_bo = NULL;
458 }
459
460 static void
461 pvr_rt_get_tail_ptr_stride_size(const struct pvr_device *device,
462 const struct pvr_rt_mtile_info *mtile_info,
463 uint32_t layers,
464 uint64_t *const stride_out,
465 uint64_t *const size_out)
466 {
467 uint32_t max_num_mtiles;
468 uint32_t num_mtiles_x;
469 uint32_t num_mtiles_y;
470 uint32_t version;
471 uint64_t size;
472
473 num_mtiles_x = mtile_info->mtiles_x * mtile_info->tiles_per_mtile_x;
474 num_mtiles_y = mtile_info->mtiles_y * mtile_info->tiles_per_mtile_y;
475
476 max_num_mtiles = MAX2(util_next_power_of_two64(num_mtiles_x),
477 util_next_power_of_two64(num_mtiles_y));
478
479 size = max_num_mtiles * max_num_mtiles;
480
481 if (PVR_FEATURE_VALUE(&device->pdevice->dev_info,
482 simple_parameter_format_version,
483 &version)) {
484 version = 0;
485 }
486
487 if (version == 2) {
488 /* One tail pointer cache entry per 2x2 tile group. */
489 size /= (2U * 2U);
490 }
491
492 size *= ROGUE_TAIL_POINTER_SIZE;
493
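/* For layered rendering each layer's tail pointers start on a PM physical
 * page boundary and the stride is returned in units of pages; a single
 * layer needs no stride.
 */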
494 if (layers > 1) {
495 size = ALIGN_POT(size, ROGUE_BIF_PM_PHYSICAL_PAGE_SIZE);
496
497 *stride_out = size / ROGUE_BIF_PM_PHYSICAL_PAGE_SIZE;
498 *size_out = size * layers;
499 } else {
500 *stride_out = 0;
501 *size_out = size;
502 }
503 }
504
505 static VkResult pvr_rt_tpc_data_init(struct pvr_device *device,
506 struct pvr_rt_dataset *rt_dataset,
507 const struct pvr_rt_mtile_info *mtile_info,
508 uint32_t layers)
509 {
510 const uint64_t bo_flags = PVR_BO_ALLOC_FLAG_GPU_UNCACHED |
511 PVR_BO_ALLOC_FLAG_ZERO_ON_ALLOC;
512 uint64_t tpc_size;
513
514 pvr_rt_get_tail_ptr_stride_size(device,
515 mtile_info,
516 layers,
517 &rt_dataset->tpc_stride,
518 &rt_dataset->tpc_size);
519 tpc_size = ALIGN_POT(rt_dataset->tpc_size, ROGUE_TE_TPC_CACHE_LINE_SIZE);
520
521 return pvr_bo_alloc(device,
522 device->heaps.general_heap,
523 tpc_size,
524 PVRX(CR_TE_TPC_ADDR_BASE_ALIGNMENT),
525 bo_flags,
526 &rt_dataset->tpc_bo);
527 }
528
529 static void pvr_rt_tpc_data_fini(struct pvr_rt_dataset *rt_dataset)
530 {
531 pvr_bo_free(rt_dataset->device, rt_dataset->tpc_bo);
532 rt_dataset->tpc_bo = NULL;
533 }
534
535 static uint32_t
536 pvr_rt_get_mlist_size(const struct pvr_free_list *global_free_list,
537 const struct pvr_free_list *local_free_list)
538 {
539 uint32_t num_pte_pages;
540 uint32_t num_pde_pages;
541 uint32_t num_pce_pages;
542 uint64_t total_pages;
543 uint32_t mlist_size;
544
545 assert(global_free_list->size + local_free_list->size <=
546 ROGUE_PM_MAX_PB_VIRT_ADDR_SPACE);
547
548 total_pages = (global_free_list->size + local_free_list->size) >>
549 ROGUE_BIF_PM_PHYSICAL_PAGE_SHIFT;
550
551 /* Calculate the total number of physical pages required to hold the page
552 * table, directory and catalog entries for the freelist pages.
553 */
554 num_pte_pages = DIV_ROUND_UP(total_pages, ROGUE_NUM_PT_ENTRIES_PER_PAGE);
555 num_pde_pages = DIV_ROUND_UP(num_pte_pages, ROGUE_NUM_PD_ENTRIES_PER_PAGE);
556 num_pce_pages = DIV_ROUND_UP(num_pde_pages, ROGUE_NUM_PC_ENTRIES_PER_PAGE);
557
558 /* Calculate the MList size considering the total number of pages in the PB
559 * are shared among all the PM address spaces.
560 */
561 mlist_size = (num_pce_pages + num_pde_pages + num_pte_pages) *
562 ROGUE_NUM_PM_ADDRESS_SPACES * ROGUE_MLIST_ENTRY_STRIDE;
563
564 return ALIGN_POT(mlist_size, ROGUE_BIF_PM_PHYSICAL_PAGE_SIZE);
565 }
566
567 static void pvr_rt_get_region_headers_stride_size(
568 const struct pvr_device *device,
569 const struct pvr_rt_mtile_info *mtile_info,
570 uint32_t layers,
571 uint64_t *const stride_out,
572 uint64_t *const size_out)
573 {
574 const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
575 const uint32_t rgn_header_size = rogue_get_region_header_size(dev_info);
576 uint32_t rgn_headers_size;
577 uint32_t num_tiles_x;
578 uint32_t num_tiles_y;
579 uint32_t group_size;
580 uint32_t version;
581
582 if (PVR_FEATURE_VALUE(dev_info, simple_parameter_format_version, &version))
583 version = 0;
584
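/* With simple internal parameter format version 2 there is one region
 * header per 2x2 tile group (see pvr_rt_get_isp_region_size()).
 */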
585 group_size = version == 2 ? 2 : 1;
586
587 num_tiles_x = mtile_info->mtiles_x * mtile_info->tiles_per_mtile_x;
588 num_tiles_y = mtile_info->mtiles_y * mtile_info->tiles_per_mtile_y;
589
590 rgn_headers_size =
591 (num_tiles_x / group_size) * (num_tiles_y / group_size) * rgn_header_size;
592
593 if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) {
594 rgn_headers_size =
595 ALIGN_POT(rgn_headers_size, PVRX(CR_TE_PSGREGION_ADDR_BASE_ALIGNMENT));
596 }
597
598 if (layers > 1) {
599 rgn_headers_size =
600 ALIGN_POT(rgn_headers_size, PVRX(CR_TE_PSG_REGION_STRIDE_UNIT_SIZE));
601 }
602
603 *stride_out = rgn_header_size;
604 *size_out = rgn_headers_size * layers;
605 }
606
607 static VkResult
608 pvr_rt_mta_mlist_data_init(struct pvr_device *device,
609 struct pvr_rt_dataset *rt_dataset,
610 const struct pvr_free_list *global_free_list,
611 const struct pvr_free_list *local_free_list,
612 const struct pvr_rt_mtile_info *mtile_info)
613 {
614 const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
615 const uint32_t mlist_size =
616 pvr_rt_get_mlist_size(global_free_list, local_free_list);
617 uint32_t mta_size = rogue_get_macrotile_array_size(dev_info);
618 const uint32_t num_rt_datas = ARRAY_SIZE(rt_dataset->rt_datas);
619 uint32_t rt_datas_mlist_size;
620 uint32_t rt_datas_mta_size;
621 pvr_dev_addr_t dev_addr;
622 VkResult result;
623
624 /* Allocate memory for macrotile array and Mlist for all RT datas.
625 *
626 * Allocation layout: MTA[0..N] + Mlist alignment padding + Mlist[0..N].
627 *
628 * N is the number of RT datas.
629 */
630 rt_datas_mta_size = ALIGN_POT(mta_size * num_rt_datas,
631 PVRX(CR_PM_MLIST0_BASE_ADDR_ALIGNMENT));
632 rt_datas_mlist_size = mlist_size * num_rt_datas;
633
634 result = pvr_bo_alloc(device,
635 device->heaps.general_heap,
636 rt_datas_mta_size + rt_datas_mlist_size,
637 PVRX(CR_PM_MTILE_ARRAY_BASE_ADDR_ALIGNMENT),
638 PVR_BO_ALLOC_FLAG_GPU_UNCACHED,
639 &rt_dataset->mta_mlist_bo);
640 if (result != VK_SUCCESS)
641 return result;
642
643 dev_addr = rt_dataset->mta_mlist_bo->vma->dev_addr;
644
645 for (uint32_t i = 0; i < num_rt_datas; i++) {
646 if (mta_size != 0) {
647 rt_dataset->rt_datas[i].mta_dev_addr = dev_addr;
648 dev_addr = PVR_DEV_ADDR_OFFSET(dev_addr, mta_size);
649 } else {
650 rt_dataset->rt_datas[i].mta_dev_addr = PVR_DEV_ADDR_INVALID;
651 }
652 }
653
654 dev_addr = PVR_DEV_ADDR_OFFSET(rt_dataset->mta_mlist_bo->vma->dev_addr,
655 rt_datas_mta_size);
656
657 for (uint32_t i = 0; i < num_rt_datas; i++) {
658 if (mlist_size != 0) {
659 rt_dataset->rt_datas[i].mlist_dev_addr = dev_addr;
660 dev_addr = PVR_DEV_ADDR_OFFSET(dev_addr, mlist_size);
661 } else {
662 rt_dataset->rt_datas[i].mlist_dev_addr = PVR_DEV_ADDR_INVALID;
663 }
664 }
665
666 return VK_SUCCESS;
667 }
668
669 static void pvr_rt_mta_mlist_data_fini(struct pvr_rt_dataset *rt_dataset)
670 {
671 for (uint32_t i = 0; i < ARRAY_SIZE(rt_dataset->rt_datas); i++) {
672 rt_dataset->rt_datas[i].mlist_dev_addr = PVR_DEV_ADDR_INVALID;
673 rt_dataset->rt_datas[i].mta_dev_addr = PVR_DEV_ADDR_INVALID;
674 }
675
676 pvr_bo_free(rt_dataset->device, rt_dataset->mta_mlist_bo);
677 rt_dataset->mta_mlist_bo = NULL;
678 }
679
680 static VkResult
681 pvr_rt_rgn_headers_data_init(struct pvr_device *device,
682 struct pvr_rt_dataset *rt_dataset,
683 const struct pvr_rt_mtile_info *mtile_info,
684 uint32_t layers)
685 {
686 const uint32_t num_rt_datas = ARRAY_SIZE(rt_dataset->rt_datas);
687 uint64_t rgn_headers_size;
688 pvr_dev_addr_t dev_addr;
689 VkResult result;
690
691 pvr_rt_get_region_headers_stride_size(device,
692 mtile_info,
693 layers,
694 &rt_dataset->rgn_headers_stride,
695 &rgn_headers_size);
696
697 result = pvr_bo_alloc(device,
698 device->heaps.rgn_hdr_heap,
699 rgn_headers_size * num_rt_datas,
700 PVRX(CR_TE_PSGREGION_ADDR_BASE_ALIGNMENT),
701 PVR_BO_ALLOC_FLAG_GPU_UNCACHED,
702 &rt_dataset->rgn_headers_bo);
703 if (result != VK_SUCCESS)
704 return result;
705
706 dev_addr = rt_dataset->rgn_headers_bo->vma->dev_addr;
707
708 for (uint32_t i = 0; i < num_rt_datas; i++) {
709 rt_dataset->rt_datas[i].rgn_headers_dev_addr = dev_addr;
710 dev_addr = PVR_DEV_ADDR_OFFSET(dev_addr, rgn_headers_size);
711 }
712
713 return VK_SUCCESS;
714 }
715
716 static void pvr_rt_rgn_headers_data_fini(struct pvr_rt_dataset *rt_dataset)
717 {
718 for (uint32_t i = 0; i < ARRAY_SIZE(rt_dataset->rt_datas); i++)
719 rt_dataset->rt_datas[i].rgn_headers_dev_addr = PVR_DEV_ADDR_INVALID;
720
721 pvr_bo_free(rt_dataset->device, rt_dataset->rgn_headers_bo);
722 rt_dataset->rgn_headers_bo = NULL;
723 }
724
725 static VkResult pvr_rt_datas_init(struct pvr_device *device,
726 struct pvr_rt_dataset *rt_dataset,
727 const struct pvr_free_list *global_free_list,
728 const struct pvr_free_list *local_free_list,
729 const struct pvr_rt_mtile_info *mtile_info,
730 uint32_t layers)
731 {
732 VkResult result;
733
734 result = pvr_rt_mta_mlist_data_init(device,
735 rt_dataset,
736 global_free_list,
737 local_free_list,
738 mtile_info);
739 if (result != VK_SUCCESS)
740 return result;
741
742 result =
743 pvr_rt_rgn_headers_data_init(device, rt_dataset, mtile_info, layers);
744 if (result != VK_SUCCESS)
745 goto err_pvr_rt_mta_mlist_data_fini;
746
747 return VK_SUCCESS;
748
749 err_pvr_rt_mta_mlist_data_fini:
750 pvr_rt_mta_mlist_data_fini(rt_dataset);
751
752 return result;
753 }
754
755 static void pvr_rt_datas_fini(struct pvr_rt_dataset *rt_dataset)
756 {
757 pvr_rt_rgn_headers_data_fini(rt_dataset);
758 pvr_rt_mta_mlist_data_fini(rt_dataset);
759 }
760
761 static uint32_t
762 pvr_rogue_get_cr_isp_mtile_size_val(const struct pvr_device_info *dev_info,
763 uint32_t samples,
764 const struct pvr_rt_mtile_info *mtile_info)
765 {
766 uint32_t samples_per_pixel =
767 PVR_GET_FEATURE_VALUE(dev_info, isp_samples_per_pixel, 0);
768 uint32_t isp_mtile_size;
769
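/* The macrotile dimensions are doubled when the requested sample count
 * exceeds the ISP samples per pixel, presumably so a macrotile still
 * covers every sample position.
 */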
770 pvr_csb_pack (&isp_mtile_size, CR_ISP_MTILE_SIZE, value) {
771 value.x = mtile_info->mtile_x1;
772 value.y = mtile_info->mtile_y1;
773
774 if (samples_per_pixel == 1) {
775 if (samples >= 4)
776 value.x <<= 1;
777
778 if (samples >= 2)
779 value.y <<= 1;
780 } else if (samples_per_pixel == 2) {
781 if (samples >= 8)
782 value.x <<= 1;
783
784 if (samples >= 4)
785 value.y <<= 1;
786 } else if (samples_per_pixel == 4) {
787 if (samples >= 8)
788 value.y <<= 1;
789 } else {
790 assert(!"Unsupported ISP samples per pixel value");
791 }
792 }
793
794 return isp_mtile_size;
795 }
796
797 static uint64_t pvr_rogue_get_cr_multisamplectl_val(uint32_t samples,
798 bool y_flip)
799 {
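/* The sample positions appear to be expressed on a 16x16 sub-pixel grid
 * with the pixel centre at (8, 8); y_flip mirrors the Y coordinates about
 * the pixel centre using (16 - y).
 */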
800 static const struct {
801 uint8_t x[8];
802 uint8_t y[8];
803 } sample_positions[4] = {
804 /* 1 sample */
805 {
806 .x = { 8 },
807 .y = { 8 },
808 },
809 /* 2 samples */
810 {
811 .x = { 12, 4 },
812 .y = { 12, 4 },
813 },
814 /* 4 samples */
815 {
816 .x = { 6, 14, 2, 10 },
817 .y = { 2, 6, 10, 14 },
818 },
819 /* 8 samples */
820 {
821 .x = { 9, 7, 13, 5, 3, 1, 11, 15 },
822 .y = { 5, 11, 9, 3, 13, 7, 15, 1 },
823 },
824 };
825 uint64_t multisamplectl;
826 uint8_t idx;
827
828 idx = util_fast_log2(samples);
829 assert(idx < ARRAY_SIZE(sample_positions));
830
831 pvr_csb_pack (&multisamplectl, CR_PPP_MULTISAMPLECTL, value) {
832 switch (samples) {
833 case 8:
834 value.msaa_x7 = sample_positions[idx].x[7];
835 value.msaa_x6 = sample_positions[idx].x[6];
836 value.msaa_x5 = sample_positions[idx].x[5];
837 value.msaa_x4 = sample_positions[idx].x[4];
838
839 if (y_flip) {
840 value.msaa_y7 = 16U - sample_positions[idx].y[7];
841 value.msaa_y6 = 16U - sample_positions[idx].y[6];
842 value.msaa_y5 = 16U - sample_positions[idx].y[5];
843 value.msaa_y4 = 16U - sample_positions[idx].y[4];
844 } else {
845 value.msaa_y7 = sample_positions[idx].y[7];
846 value.msaa_y6 = sample_positions[idx].y[6];
847 value.msaa_y5 = sample_positions[idx].y[5];
848 value.msaa_y4 = sample_positions[idx].y[4];
849 }
850
851 FALLTHROUGH;
852 case 4:
853 value.msaa_x3 = sample_positions[idx].x[3];
854 value.msaa_x2 = sample_positions[idx].x[2];
855
856 if (y_flip) {
857 value.msaa_y3 = 16U - sample_positions[idx].y[3];
858 value.msaa_y2 = 16U - sample_positions[idx].y[2];
859 } else {
860 value.msaa_y3 = sample_positions[idx].y[3];
861 value.msaa_y2 = sample_positions[idx].y[2];
862 }
863
864 FALLTHROUGH;
865 case 2:
866 value.msaa_x1 = sample_positions[idx].x[1];
867
868 if (y_flip) {
869 value.msaa_y1 = 16U - sample_positions[idx].y[1];
870 } else {
871 value.msaa_y1 = sample_positions[idx].y[1];
872 }
873
874 FALLTHROUGH;
875 case 1:
876 value.msaa_x0 = sample_positions[idx].x[0];
877
878 if (y_flip) {
879 value.msaa_y0 = 16U - sample_positions[idx].y[0];
880 } else {
881 value.msaa_y0 = sample_positions[idx].y[0];
882 }
883
884 break;
885 default:
886 unreachable("Unsupported number of samples");
887 }
888 }
889
890 return multisamplectl;
891 }
892
893 static uint32_t
894 pvr_rogue_get_cr_te_aa_val(const struct pvr_device_info *dev_info,
895 uint32_t samples)
896 {
897 uint32_t samples_per_pixel =
898 PVR_GET_FEATURE_VALUE(dev_info, isp_samples_per_pixel, 0);
899 uint32_t te_aa;
900
901 pvr_csb_pack (&te_aa, CR_TE_AA, value) {
902 if (samples_per_pixel == 1) {
903 if (samples >= 2)
904 value.y = true;
905 if (samples >= 4)
906 value.x = true;
907 } else if (samples_per_pixel == 2) {
908 if (samples >= 2)
909 value.x2 = true;
910 if (samples >= 4)
911 value.y = true;
912 if (samples >= 8)
913 value.x = true;
914 } else if (samples_per_pixel == 4) {
915 if (samples >= 2)
916 value.x2 = true;
917 if (samples >= 4)
918 value.y2 = true;
919 if (samples >= 8)
920 value.y = true;
921 } else {
922 assert(!"Unsupported ISP samples per pixel value");
923 }
924 }
925
926 return te_aa;
927 }
928
929 static void pvr_rt_dataset_ws_create_info_init(
930 struct pvr_rt_dataset *rt_dataset,
931 const struct pvr_rt_mtile_info *mtile_info,
932 struct pvr_winsys_rt_dataset_create_info *create_info)
933 {
934 struct pvr_device *device = rt_dataset->device;
935 const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
936
937 memset(create_info, 0, sizeof(*create_info));
938
939 /* Local freelist. */
940 create_info->local_free_list = rt_dataset->local_free_list->ws_free_list;
941
942 /* ISP register values. */
943 if (PVR_HAS_ERN(dev_info, 42307) &&
944 !(PVR_HAS_FEATURE(dev_info, roguexe) && mtile_info->tile_size_x == 16)) {
945 float value;
946
947 if (rt_dataset->width != 0) {
948 value =
949 ROGUE_ISP_MERGE_LOWER_LIMIT_NUMERATOR / (float)rt_dataset->width;
950 create_info->isp_merge_lower_x = fui(value);
951
952 value =
953 ROGUE_ISP_MERGE_UPPER_LIMIT_NUMERATOR / (float)rt_dataset->width;
954 create_info->isp_merge_upper_x = fui(value);
955 }
956
957 if (rt_dataset->height != 0) {
958 value =
959 ROGUE_ISP_MERGE_LOWER_LIMIT_NUMERATOR / (float)rt_dataset->height;
960 create_info->isp_merge_lower_y = fui(value);
961
962 value =
963 ROGUE_ISP_MERGE_UPPER_LIMIT_NUMERATOR / (float)rt_dataset->height;
964 create_info->isp_merge_upper_y = fui(value);
965 }
966
967 value = ((float)rt_dataset->width * ROGUE_ISP_MERGE_SCALE_FACTOR) /
968 (ROGUE_ISP_MERGE_UPPER_LIMIT_NUMERATOR -
969 ROGUE_ISP_MERGE_LOWER_LIMIT_NUMERATOR);
970 create_info->isp_merge_scale_x = fui(value);
971
972 value = ((float)rt_dataset->height * ROGUE_ISP_MERGE_SCALE_FACTOR) /
973 (ROGUE_ISP_MERGE_UPPER_LIMIT_NUMERATOR -
974 ROGUE_ISP_MERGE_LOWER_LIMIT_NUMERATOR);
975 create_info->isp_merge_scale_y = fui(value);
976 }
977
978 create_info->isp_mtile_size =
979 pvr_rogue_get_cr_isp_mtile_size_val(dev_info,
980 rt_dataset->samples,
981 mtile_info);
982
983 /* PPP register values. */
984 create_info->ppp_multi_sample_ctl =
985 pvr_rogue_get_cr_multisamplectl_val(rt_dataset->samples, false);
986 create_info->ppp_multi_sample_ctl_y_flipped =
987 pvr_rogue_get_cr_multisamplectl_val(rt_dataset->samples, true);
988
989 pvr_csb_pack (&create_info->ppp_screen, CR_PPP_SCREEN, value) {
990 value.pixxmax = rt_dataset->width - 1;
991 value.pixymax = rt_dataset->height - 1;
992 }
993
994 /* TE register values. */
995 create_info->te_aa =
996 pvr_rogue_get_cr_te_aa_val(dev_info, rt_dataset->samples);
997
998 pvr_csb_pack (&create_info->te_mtile1, CR_TE_MTILE1, value) {
999 value.x1 = mtile_info->mtile_x1;
1000 if (!PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) {
1001 value.x2 = mtile_info->mtile_x2;
1002 value.x3 = mtile_info->mtile_x3;
1003 }
1004 }
1005
1006 pvr_csb_pack (&create_info->te_mtile2, CR_TE_MTILE2, value) {
1007 value.y1 = mtile_info->mtile_y1;
1008 if (!PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) {
1009 value.y2 = mtile_info->mtile_y2;
1010 value.y3 = mtile_info->mtile_y3;
1011 }
1012 }
1013
1014 pvr_csb_pack (&create_info->te_screen, CR_TE_SCREEN, value) {
1015 value.xmax = mtile_info->x_tile_max;
1016 value.ymax = mtile_info->y_tile_max;
1017 }
1018
1019 /* Allocations and associated information. */
1020 create_info->vheap_table_dev_addr = rt_dataset->vheap_dev_addr;
1021 create_info->rtc_dev_addr = rt_dataset->rtc_dev_addr;
1022
1023 create_info->tpc_dev_addr = rt_dataset->tpc_bo->vma->dev_addr;
1024 create_info->tpc_stride = rt_dataset->tpc_stride;
1025 create_info->tpc_size = rt_dataset->tpc_size;
1026
1027 STATIC_ASSERT(ARRAY_SIZE(create_info->rt_datas) ==
1028 ARRAY_SIZE(rt_dataset->rt_datas));
1029 for (uint32_t i = 0; i < ARRAY_SIZE(create_info->rt_datas); i++) {
1030 create_info->rt_datas[i].pm_mlist_dev_addr =
1031 rt_dataset->rt_datas[i].mlist_dev_addr;
1032 create_info->rt_datas[i].macrotile_array_dev_addr =
1033 rt_dataset->rt_datas[i].mta_dev_addr;
1034 create_info->rt_datas[i].rgn_header_dev_addr =
1035 rt_dataset->rt_datas[i].rgn_headers_dev_addr;
1036 }
1037
1038 create_info->rgn_header_size =
1039 pvr_rt_get_isp_region_size(device, mtile_info);
1040
1041 /* Miscellaneous. */
1042 create_info->mtile_stride = mtile_info->mtile_stride;
1043 create_info->max_rts = rt_dataset->layers;
1044 }
1045
1046 VkResult
1047 pvr_render_target_dataset_create(struct pvr_device *device,
1048 uint32_t width,
1049 uint32_t height,
1050 uint32_t samples,
1051 uint32_t layers,
1052 struct pvr_rt_dataset **const rt_dataset_out)
1053 {
1054 const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
1055 struct pvr_winsys_rt_dataset_create_info rt_dataset_create_info;
1056 struct pvr_rt_mtile_info mtile_info;
1057 struct pvr_rt_dataset *rt_dataset;
1058 VkResult result;
1059
1060 assert(device->global_free_list);
1061 assert(width <= rogue_get_render_size_max_x(dev_info));
1062 assert(height <= rogue_get_render_size_max_y(dev_info));
1063 assert(layers > 0 && layers <= PVR_MAX_FRAMEBUFFER_LAYERS);
1064
1065 pvr_rt_mtile_info_init(device, &mtile_info, width, height, samples);
1066
1067 rt_dataset = vk_zalloc(&device->vk.alloc,
1068 sizeof(*rt_dataset),
1069 8,
1070 VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
1071 if (!rt_dataset)
1072 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
1073
1074 rt_dataset->device = device;
1075 rt_dataset->width = width;
1076 rt_dataset->height = height;
1077 rt_dataset->samples = samples;
1078 rt_dataset->layers = layers;
1079 rt_dataset->global_free_list = device->global_free_list;
1080
1081 /* The maximum supported free list size is based on the assumption that this
1082 * freelist (the "local" freelist) is always the minimum size required by
1083 * the hardware. See the documentation of ROGUE_FREE_LIST_MAX_SIZE for more
1084 * details.
1085 */
1086 result = pvr_free_list_create(device,
1087 rogue_get_min_free_list_size(dev_info),
1088 rogue_get_min_free_list_size(dev_info),
1089 0 /* grow_size */,
1090 0 /* grow_threshold */,
1091 rt_dataset->global_free_list,
1092 &rt_dataset->local_free_list);
1093 if (result != VK_SUCCESS)
1094 goto err_vk_free_rt_dataset;
1095
1096 result = pvr_rt_vheap_rtc_data_init(device, rt_dataset, layers);
1097 if (result != VK_SUCCESS)
1098 goto err_pvr_free_list_destroy;
1099
1100 result = pvr_rt_tpc_data_init(device, rt_dataset, &mtile_info, layers);
1101 if (result != VK_SUCCESS)
1102 goto err_pvr_rt_vheap_rtc_data_fini;
1103
1104 result = pvr_rt_datas_init(device,
1105 rt_dataset,
1106 rt_dataset->global_free_list,
1107 rt_dataset->local_free_list,
1108 &mtile_info,
1109 layers);
1110 if (result != VK_SUCCESS)
1111 goto err_pvr_rt_tpc_data_fini;
1112
1113 /* rt_dataset must be fully initialized by this point since
1114 * pvr_rt_dataset_ws_create_info_init() depends on this.
1115 */
1116 pvr_rt_dataset_ws_create_info_init(rt_dataset,
1117 &mtile_info,
1118 &rt_dataset_create_info);
1119
1120 result =
1121 device->ws->ops->render_target_dataset_create(device->ws,
1122 &rt_dataset_create_info,
1123 &rt_dataset->ws_rt_dataset);
1124 if (result != VK_SUCCESS)
1125 goto err_pvr_rt_datas_fini;
1126
1127 *rt_dataset_out = rt_dataset;
1128
1129 return VK_SUCCESS;
1130
1131 err_pvr_rt_datas_fini:
1132 pvr_rt_datas_fini(rt_dataset);
1133
1134 err_pvr_rt_tpc_data_fini:
1135 pvr_rt_tpc_data_fini(rt_dataset);
1136
1137 err_pvr_rt_vheap_rtc_data_fini:
1138 pvr_rt_vheap_rtc_data_fini(rt_dataset);
1139
1140 err_pvr_free_list_destroy:
1141 pvr_free_list_destroy(rt_dataset->local_free_list);
1142
1143 err_vk_free_rt_dataset:
1144 vk_free(&device->vk.alloc, rt_dataset);
1145
1146 return result;
1147 }
1148
1149 void pvr_render_target_dataset_destroy(struct pvr_rt_dataset *rt_dataset)
1150 {
1151 struct pvr_device *device = rt_dataset->device;
1152
1153 device->ws->ops->render_target_dataset_destroy(rt_dataset->ws_rt_dataset);
1154
1155 pvr_rt_datas_fini(rt_dataset);
1156 pvr_rt_tpc_data_fini(rt_dataset);
1157 pvr_rt_vheap_rtc_data_fini(rt_dataset);
1158
1159 pvr_free_list_destroy(rt_dataset->local_free_list);
1160
1161 vk_free(&device->vk.alloc, rt_dataset);
1162 }
1163
1164 static void
1165 pvr_render_job_ws_geometry_state_init(struct pvr_render_ctx *ctx,
1166 struct pvr_render_job *job,
1167 struct pvr_winsys_geometry_state *state)
1168 {
1169 const struct pvr_device_info *dev_info = &ctx->device->pdevice->dev_info;
1170
1171 /* FIXME: Should this just be done unconditionally? The firmware will just
1172 * ignore the value anyway.
1173 */
1174 if (PVR_HAS_QUIRK(dev_info, 56279)) {
1175 pvr_csb_pack (&state->regs.pds_ctrl, CR_PDS_CTRL, value) {
1176 value.max_num_vdm_tasks = rogue_get_max_num_vdm_pds_tasks(dev_info);
1177 }
1178 } else {
1179 state->regs.pds_ctrl = 0;
1180 }
1181
1182 pvr_csb_pack (&state->regs.ppp_ctrl, CR_PPP_CTRL, value) {
1183 value.wclampen = true;
1184 value.fixed_point_format = 1;
1185 }
1186
1187 pvr_csb_pack (&state->regs.te_psg, CR_TE_PSG, value) {
1188 value.completeonterminate = job->geometry_terminate;
1189
1190 value.region_stride = job->rt_dataset->rgn_headers_stride /
1191 PVRX(CR_TE_PSG_REGION_STRIDE_UNIT_SIZE);
1192
1193 value.forcenewstate = PVR_HAS_QUIRK(dev_info, 52942);
1194 }
1195
1196 /* The set up of CR_TPU must be identical to
1197 * pvr_render_job_ws_fragment_state_init().
1198 */
1199 pvr_csb_pack (&state->regs.tpu, CR_TPU, value) {
1200 value.tag_cem_4k_face_packing = true;
1201 }
1202
1203 pvr_csb_pack (&state->regs.tpu_border_colour_table,
1204 CR_TPU_BORDER_COLOUR_TABLE_VDM,
1205 value) {
1206 value.border_colour_table_address = job->border_colour_table_addr;
1207 }
1208
1209 pvr_csb_pack (&state->regs.vdm_ctrl_stream_base,
1210 CR_VDM_CTRL_STREAM_BASE,
1211 value) {
1212 value.addr = job->ctrl_stream_addr;
1213 }
1214
1215 /* Set up the USC common size for the context switch resume/load program
1216 * (ctx->ctx_switch.programs[i].sr->pds_load_program), which was created
1217 * as part of the render context.
1218 */
1219 pvr_csb_pack (&state->regs.vdm_ctx_resume_task0_size,
1220 VDMCTRL_PDS_STATE0,
1221 value) {
1222 /* Calculate the size in bytes. */
1223 const uint16_t shared_registers_size = job->max_shared_registers * 4;
1224
1225 value.usc_common_size =
1226 DIV_ROUND_UP(shared_registers_size,
1227 PVRX(VDMCTRL_PDS_STATE0_USC_COMMON_SIZE_UNIT_SIZE));
1228 };
1229
1230 state->flags = 0;
1231
1232 if (!job->rt_dataset->need_frag)
1233 state->flags |= PVR_WINSYS_GEOM_FLAG_FIRST_GEOMETRY;
1234
1235 if (job->geometry_terminate)
1236 state->flags |= PVR_WINSYS_GEOM_FLAG_LAST_GEOMETRY;
1237
1238 if (job->frag_uses_atomic_ops)
1239 state->flags |= PVR_WINSYS_GEOM_FLAG_SINGLE_CORE;
1240 }
1241
1242 static inline void
1243 pvr_get_isp_num_tiles_xy(const struct pvr_device_info *dev_info,
1244 uint32_t samples,
1245 uint32_t width,
1246 uint32_t height,
1247 uint32_t *const x_out,
1248 uint32_t *const y_out)
1249 {
1250 uint32_t tile_samples_x;
1251 uint32_t tile_samples_y;
1252 uint32_t scale_x;
1253 uint32_t scale_y;
1254
1255 rogue_get_isp_samples_per_tile_xy(dev_info,
1256 samples,
1257 &tile_samples_x,
1258 &tile_samples_y);
1259
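/* The per-axis scale mirrors the sample layout used by
 * pvr_get_samples_in_xy().
 */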
1260 switch (samples) {
1261 case 1:
1262 scale_x = 1;
1263 scale_y = 1;
1264 break;
1265 case 2:
1266 scale_x = 1;
1267 scale_y = 2;
1268 break;
1269 case 4:
1270 scale_x = 2;
1271 scale_y = 2;
1272 break;
1273 case 8:
1274 scale_x = 2;
1275 scale_y = 4;
1276 break;
1277 default:
1278 unreachable("Unsupported number of samples");
1279 }
1280
1281 *x_out = DIV_ROUND_UP(width * scale_x, tile_samples_x);
1282 *y_out = DIV_ROUND_UP(height * scale_y, tile_samples_y);
1283
1284 if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) {
1285 assert(PVR_GET_FEATURE_VALUE(dev_info,
1286 simple_parameter_format_version,
1287 0U) == 2U);
1288 /* Align to a 2x2 tile block. */
1289 *x_out = ALIGN_POT(*x_out, 2);
1290 *y_out = ALIGN_POT(*y_out, 2);
1291 }
1292 }
1293
1294 static void
1295 pvr_render_job_ws_fragment_state_init(struct pvr_render_ctx *ctx,
1296 struct pvr_render_job *job,
1297 struct pvr_winsys_fragment_state *state)
1298 {
1299 const enum PVRX(CR_ISP_AA_MODE_TYPE)
1300 isp_aa_mode = pvr_cr_isp_aa_mode_type(job->samples);
1301 const struct pvr_device_runtime_info *dev_runtime_info =
1302 &ctx->device->pdevice->dev_runtime_info;
1303 const struct pvr_device_info *dev_info = &ctx->device->pdevice->dev_info;
1304 uint32_t isp_ctl;
1305
1306 /* FIXME: what to do when job->run_frag is false? */
1307
1308 /* FIXME: pass in the number of samples rather than isp_aa_mode? */
1309 pvr_setup_tiles_in_flight(dev_info,
1310 dev_runtime_info,
1311 isp_aa_mode,
1312 job->pixel_output_width,
1313 false,
1314 job->max_tiles_in_flight,
1315 &isp_ctl,
1316 &state->regs.usc_pixel_output_ctrl);
1317
1318 pvr_csb_pack (&state->regs.isp_ctl, CR_ISP_CTL, value) {
1319 value.sample_pos = true;
1320
1321 /* FIXME: There are a number of things that cause this to be set, this
1322 * is just one of them.
1323 */
1324 value.process_empty_tiles = job->process_empty_tiles;
1325 }
1326
1327 /* FIXME: When pvr_setup_tiles_in_flight() is refactored it might be
1328 * possible to fully pack CR_ISP_CTL above rather than having to OR in part
1329 * of the value.
1330 */
1331 state->regs.isp_ctl |= isp_ctl;
1332
1333 pvr_csb_pack (&state->regs.isp_aa, CR_ISP_AA, value) {
1334 value.mode = isp_aa_mode;
1335 }
1336
1337 /* The set up of CR_TPU must be identical to
1338 * pvr_render_job_ws_geometry_state_init().
1339 */
1340 pvr_csb_pack (&state->regs.tpu, CR_TPU, value) {
1341 value.tag_cem_4k_face_packing = true;
1342 }
1343
1344 if (PVR_HAS_FEATURE(dev_info, cluster_grouping) &&
1345 PVR_HAS_FEATURE(dev_info, slc_mcu_cache_controls) &&
1346 dev_runtime_info->num_phantoms > 1 && job->frag_uses_atomic_ops) {
1347 /* Each phantom has its own MCU, so atomicity can only be guaranteed
1348 * when all work items are processed on the same phantom. This means we
1349 * need to disable all USCs other than those of the first phantom, which
1350 * has 4 clusters. Note that we only need to do this for atomic
1351 * operations in fragment shaders, since hardware prevents the TA from running
1352 * on more than one phantom anyway.
1353 */
1354 state->regs.pixel_phantom = 0xF;
1355 } else {
1356 state->regs.pixel_phantom = 0;
1357 }
1358
1359 pvr_csb_pack (&state->regs.isp_bgobjvals, CR_ISP_BGOBJVALS, value) {
1360 value.enablebgtag = job->enable_bg_tag;
1361
1362 value.mask = true;
1363
1364 /* FIXME: Hard code this for now as we don't currently support any
1365 * stencil image formats.
1366 */
1367 value.stencil = 0xFF;
1368 }
1369
1370 pvr_csb_pack (&state->regs.isp_bgobjdepth, CR_ISP_BGOBJDEPTH, value) {
1371 /* FIXME: This is suitable for the single depth format the driver
1372 * currently supports, but may need updating to handle other depth
1373 * formats.
1374 */
1375 value.value = fui(job->depth_clear_value);
1376 }
1377
1378 /* FIXME: Some additional set up needed to support depth and stencil
1379 * load/store operations.
1380 */
1381 pvr_csb_pack (&state->regs.isp_zlsctl, CR_ISP_ZLSCTL, value) {
1382 uint32_t aligned_width =
1383 ALIGN_POT(job->depth_physical_width, ROGUE_IPF_TILE_SIZE_PIXELS);
1384 uint32_t aligned_height =
1385 ALIGN_POT(job->depth_physical_height, ROGUE_IPF_TILE_SIZE_PIXELS);
1386
1387 pvr_get_isp_num_tiles_xy(dev_info,
1388 job->samples,
1389 aligned_width,
1390 aligned_height,
1391 &value.zlsextent_x_z,
1392 &value.zlsextent_y_z);
1393 value.zlsextent_x_z -= 1;
1394 value.zlsextent_y_z -= 1;
1395
1396 if (job->depth_memlayout == PVR_MEMLAYOUT_TWIDDLED) {
1397 value.loadtwiddled = true;
1398 value.storetwiddled = true;
1399 }
1400
1401 /* FIXME: This is suitable for the single depth format the driver
1402 * currently supports, but may need updating to handle other depth
1403 * formats.
1404 */
1405 assert(job->depth_vk_format == VK_FORMAT_D32_SFLOAT);
1406 value.zloadformat = PVRX(CR_ZLOADFORMAT_TYPE_F32Z);
1407 value.zstoreformat = PVRX(CR_ZSTOREFORMAT_TYPE_F32Z);
1408 }
1409
1410 if (PVR_HAS_FEATURE(dev_info, zls_subtile)) {
1411 pvr_csb_pack (&state->regs.isp_zls_pixels, CR_ISP_ZLS_PIXELS, value) {
1412 value.x = job->depth_stride - 1;
1413 value.y = job->depth_height - 1;
1414 }
1415 } else {
1416 state->regs.isp_zls_pixels = 0;
1417 }
1418
1419 pvr_csb_pack (&state->regs.isp_zload_store_base, CR_ISP_ZLOAD_BASE, value) {
1420 value.addr = job->depth_addr;
1421 }
1422
1423 pvr_csb_pack (&state->regs.isp_stencil_load_store_base,
1424 CR_ISP_STENCIL_LOAD_BASE,
1425 value) {
1426 value.addr = job->stencil_addr;
1427
1428 /* FIXME: May need to set value.enable to true. */
1429 }
1430
1431 pvr_csb_pack (&state->regs.tpu_border_colour_table,
1432 CR_TPU_BORDER_COLOUR_TABLE_PDM,
1433 value) {
1434 value.border_colour_table_address = job->border_colour_table_addr;
1435 }
1436
1437 state->regs.isp_oclqry_base = 0;
1438
1439 pvr_csb_pack (&state->regs.isp_dbias_base, CR_ISP_DBIAS_BASE, value) {
1440 value.addr = job->depth_bias_table_addr;
1441 }
1442
1443 pvr_csb_pack (&state->regs.isp_scissor_base, CR_ISP_SCISSOR_BASE, value) {
1444 value.addr = job->scissor_table_addr;
1445 }
1446
1447 pvr_csb_pack (&state->regs.event_pixel_pds_info,
1448 CR_EVENT_PIXEL_PDS_INFO,
1449 value) {
1450 value.const_size =
1451 DIV_ROUND_UP(ctx->device->pixel_event_data_size_in_dwords,
1452 PVRX(CR_EVENT_PIXEL_PDS_INFO_CONST_SIZE_UNIT_SIZE));
1453 value.temp_stride = 0;
1454 value.usc_sr_size =
1455 DIV_ROUND_UP(PVR_STATE_PBE_DWORDS,
1456 PVRX(CR_EVENT_PIXEL_PDS_INFO_USC_SR_SIZE_UNIT_SIZE));
1457 }
1458
1459 pvr_csb_pack (&state->regs.event_pixel_pds_data,
1460 CR_EVENT_PIXEL_PDS_DATA,
1461 value) {
1462 value.addr = PVR_DEV_ADDR(job->pds_pixel_event_data_offset);
1463 }
1464
1465 STATIC_ASSERT(ARRAY_SIZE(state->regs.pbe_word) ==
1466 ARRAY_SIZE(job->pbe_reg_words));
1467 STATIC_ASSERT(ARRAY_SIZE(state->regs.pbe_word[0]) ==
1468 ARRAY_SIZE(job->pbe_reg_words[0]));
1469
1470 for (uint32_t i = 0; i < ARRAY_SIZE(job->pbe_reg_words); i++) {
1471 state->regs.pbe_word[i][0] = job->pbe_reg_words[i][0];
1472 state->regs.pbe_word[i][1] = job->pbe_reg_words[i][1];
1473 state->regs.pbe_word[i][2] = job->pbe_reg_words[i][2];
1474 }
1475
1476 STATIC_ASSERT(__same_type(state->regs.pds_bgnd, job->pds_bgnd_reg_values));
1477 typed_memcpy(state->regs.pds_bgnd,
1478 job->pds_bgnd_reg_values,
1479 ARRAY_SIZE(state->regs.pds_bgnd));
1480
1481 memset(state->regs.pds_pr_bgnd, 0, sizeof(state->regs.pds_pr_bgnd));
1482
1483 /* FIXME: Merge geometry and fragment flags into a single flags member? */
1484 /* FIXME: move to its own function? */
1485 state->flags = 0;
1486
1487 if (job->depth_addr.addr)
1488 state->flags |= PVR_WINSYS_FRAG_FLAG_DEPTH_BUFFER_PRESENT;
1489
1490 if (job->stencil_addr.addr)
1491 state->flags |= PVR_WINSYS_FRAG_FLAG_STENCIL_BUFFER_PRESENT;
1492
1493 if (job->disable_compute_overlap)
1494 state->flags |= PVR_WINSYS_FRAG_FLAG_PREVENT_CDM_OVERLAP;
1495
1496 if (job->frag_uses_atomic_ops)
1497 state->flags |= PVR_WINSYS_FRAG_FLAG_SINGLE_CORE;
1498
1499 state->zls_stride = job->depth_layer_size;
1500 state->sls_stride = job->depth_layer_size;
1501 }
1502
1503 static void pvr_render_job_ws_submit_info_init(
1504 struct pvr_render_ctx *ctx,
1505 struct pvr_render_job *job,
1506 const struct pvr_winsys_job_bo *bos,
1507 uint32_t bo_count,
1508 struct vk_sync **waits,
1509 uint32_t wait_count,
1510 uint32_t *stage_flags,
1511 struct pvr_winsys_render_submit_info *submit_info)
1512 {
1513 memset(submit_info, 0, sizeof(*submit_info));
1514
1515 submit_info->rt_dataset = job->rt_dataset->ws_rt_dataset;
1516 submit_info->rt_data_idx = job->rt_dataset->rt_data_idx;
1517
1518 submit_info->frame_num = ctx->device->global_queue_present_count;
1519 submit_info->job_num = ctx->device->global_queue_job_count;
1520
1521 submit_info->run_frag = job->run_frag;
1522
1523 submit_info->bos = bos;
1524 submit_info->bo_count = bo_count;
1525
1526 submit_info->waits = waits;
1527 submit_info->wait_count = wait_count;
1528 submit_info->stage_flags = stage_flags;
1529
1530 /* FIXME: add WSI image bos. */
1531
1532 pvr_render_job_ws_geometry_state_init(ctx, job, &submit_info->geometry);
1533 pvr_render_job_ws_fragment_state_init(ctx, job, &submit_info->fragment);
1534
1535 /* These values are expected to match. */
1536 assert(submit_info->geometry.regs.tpu == submit_info->fragment.regs.tpu);
1537 }
1538
1539 VkResult pvr_render_job_submit(struct pvr_render_ctx *ctx,
1540 struct pvr_render_job *job,
1541 const struct pvr_winsys_job_bo *bos,
1542 uint32_t bo_count,
1543 struct vk_sync **waits,
1544 uint32_t wait_count,
1545 uint32_t *stage_flags,
1546 struct vk_sync *signal_sync_geom,
1547 struct vk_sync *signal_sync_frag)
1548 {
1549 struct pvr_rt_dataset *rt_dataset = job->rt_dataset;
1550 struct pvr_winsys_render_submit_info submit_info;
1551 struct pvr_device *device = ctx->device;
1552 VkResult result;
1553
1554 pvr_render_job_ws_submit_info_init(ctx,
1555 job,
1556 bos,
1557 bo_count,
1558 waits,
1559 wait_count,
1560 stage_flags,
1561 &submit_info);
1562
1563 result = device->ws->ops->render_submit(ctx->ws_ctx,
1564 &submit_info,
1565 signal_sync_geom,
1566 signal_sync_frag);
1567 if (result != VK_SUCCESS)
1568 return result;
1569
1570 if (job->run_frag) {
1571 /* Move to the next render target data now that a fragment job has been
1572 * successfully submitted. This will allow the next geometry job to be
1573 * submitted and run in parallel with it.
1574 */
1575 rt_dataset->rt_data_idx =
1576 (rt_dataset->rt_data_idx + 1) % ARRAY_SIZE(rt_dataset->rt_datas);
1577
1578 rt_dataset->need_frag = false;
1579 } else {
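/* No fragment job was submitted, so keep using the current RT data and
 * remember that a fragment run is still outstanding; the next geometry
 * job must not be flagged as the first geometry on this render target.
 */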
1580 rt_dataset->need_frag = true;
1581 }
1582
1583 return VK_SUCCESS;
1584 }
1585