/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 * SPDX-License-Identifier: MIT
 *
 * based in part on anv driver which is:
 * Copyright © 2015 Intel Corporation
 */

#ifndef TU_DEVICE_H
#define TU_DEVICE_H

#include "tu_common.h"

#include "vk_device_memory.h"
#include "vk_meta.h"

#include "tu_autotune.h"
#include "tu_cs.h"
#include "tu_pass.h"
#include "tu_perfetto.h"
#include "tu_suballoc.h"
#include "tu_util.h"

#include "radix_sort/radix_sort_vk.h"

#include "common/freedreno_rd_output.h"
#include "util/vma.h"
#include "util/u_vector.h"

/* queue types */
#define TU_QUEUE_GENERAL 0

#define TU_MAX_QUEUE_FAMILIES 1

#define TU_BORDER_COLOR_COUNT 4096
#define TU_BORDER_COLOR_BUILTIN 6

#define TU_BLIT_SHADER_SIZE 4096

/* extra space in vsc draw/prim streams */
#define VSC_PAD 0x40

enum global_shader {
   GLOBAL_SH_VS_BLIT,
   GLOBAL_SH_VS_CLEAR,
   GLOBAL_SH_FS_BLIT,
   GLOBAL_SH_FS_BLIT_ZSCALE,
   GLOBAL_SH_FS_COPY_MS,
   GLOBAL_SH_FS_COPY_MS_HALF,
   GLOBAL_SH_FS_CLEAR0,
   GLOBAL_SH_FS_CLEAR_MAX = GLOBAL_SH_FS_CLEAR0 + MAX_RTS,
   GLOBAL_SH_COUNT,
};
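
/* Note on the enum layout above: the per-render-target clear fragment shaders
 * occupy the range [GLOBAL_SH_FS_CLEAR0, GLOBAL_SH_FS_CLEAR0 + MAX_RTS), so
 * the shader for render target `rt` is presumably selected as
 * GLOBAL_SH_FS_CLEAR0 + rt.
 */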

struct tu_memory_heap {
   /* Standard bits passed on to the client */
   VkDeviceSize      size;
   VkMemoryHeapFlags flags;

   /** Copied from ANV:
    *
    * Driver-internal book-keeping.
    *
    * Align it to 64 bits to make atomic operations faster on 32 bit platforms.
    */
   alignas(8) VkDeviceSize used;
};
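
/* A minimal sketch of the book-keeping that `used` is aligned for, assuming
 * the p_atomic_* helpers from "util/u_atomic.h" (illustrative only, not
 * necessarily how the driver updates the heap):
 *
 *    static inline VkResult
 *    tu_heap_account_alloc(struct tu_memory_heap *heap, VkDeviceSize size)
 *    {
 *       uint64_t used = p_atomic_add_return(&heap->used, size);
 *       if (used > heap->size) {
 *          p_atomic_add(&heap->used, -(int64_t) size);
 *          return VK_ERROR_OUT_OF_DEVICE_MEMORY;
 *       }
 *       return VK_SUCCESS;
 *    }
 */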

enum tu_kgsl_dma_type
{
   TU_KGSL_DMA_TYPE_ION_LEGACY,
   TU_KGSL_DMA_TYPE_ION,
   TU_KGSL_DMA_TYPE_DMAHEAP,
};

extern uint64_t os_page_size;

struct tu_physical_device
{
   struct vk_physical_device vk;

   struct tu_instance *instance;

   const char *name;
   uint8_t driver_uuid[VK_UUID_SIZE];
   uint8_t device_uuid[VK_UUID_SIZE];
   uint8_t cache_uuid[VK_UUID_SIZE];

   struct wsi_device wsi_device;

   char fd_path[20];
   int local_fd;
   bool has_local;
   int64_t local_major;
   int64_t local_minor;
   int master_fd;
   bool has_master;
   int64_t master_major;
   int64_t master_minor;

   int kgsl_dma_fd;
   enum tu_kgsl_dma_type kgsl_dma_type;

   uint32_t gmem_size;
   uint64_t gmem_base;

   uint32_t usable_gmem_size_gmem;
   uint32_t ccu_offset_gmem;
   uint32_t ccu_offset_bypass;
   uint32_t ccu_depth_offset_bypass;
   uint32_t vpc_attr_buf_offset_gmem;
   uint32_t vpc_attr_buf_size_gmem;
   uint32_t vpc_attr_buf_offset_bypass;
   uint32_t vpc_attr_buf_size_bypass;

   /* Number of usable descriptor sets; this excludes any reserved set */
   uint32_t usable_sets;
   /* Index of the reserved descriptor set, or -1 if none is reserved */
   int32_t reserved_set_idx;

   bool has_set_iova;
   bool has_raytracing;
   uint64_t va_start;
   uint64_t va_size;

   bool has_cached_coherent_memory;
   bool has_cached_non_coherent_memory;
   uintptr_t level1_dcache_size;

   struct fdl_ubwc_config ubwc_config;

   bool has_preemption;

   struct {
      uint32_t type_count;
      VkMemoryPropertyFlags types[VK_MAX_MEMORY_TYPES];
   } memory;

   struct fd_dev_id dev_id;
   struct fd_dev_info dev_info;
   const struct fd_dev_info *info;

   int msm_major_version;
   int msm_minor_version;

   /* with 0 being the highest priority */
   uint32_t submitqueue_priority_count;

   struct tu_memory_heap heap;

   struct vk_sync_type syncobj_type;
   struct vk_sync_timeline_type timeline_type;
   const struct vk_sync_type *sync_types[3];

   uint32_t device_count;
};
VK_DEFINE_HANDLE_CASTS(tu_physical_device, vk.base, VkPhysicalDevice,
                       VK_OBJECT_TYPE_PHYSICAL_DEVICE)

struct tu_knl;

struct tu_instance
{
   struct vk_instance vk;

   const struct tu_knl *knl;

   uint32_t instance_idx;
   uint32_t api_version;

   struct driOptionCache dri_options;
   struct driOptionCache available_dri_options;

   bool dont_care_as_load;

   /* Conservative LRZ (default true) invalidates LRZ on draws with
    * blend and depth-write enabled, because this can lead to incorrect
    * rendering.  Driconf can be used to disable conservative LRZ for
    * games which do not have the problematic sequence of draws *and*
    * suffer a performance loss with conservative LRZ.
    */
   bool conservative_lrz;

   /* Whether to internally reserve a descriptor set for descriptor set
    * dynamic offsets. Disabling this frees up one descriptor set for the
    * application, at the cost of being unable to use the feature. As the
    * feature is part of core Vulkan, this is enabled by default.
    */
   bool reserve_descriptor_set;

   /* Allow out-of-bounds UBO access by disabling the lowering of UBO loads
    * with indirect access. That lowering relies on the UBO bounds specified
    * in the shader rather than the bound UBO size, which isn't known until
    * draw time.
    *
    * See: https://github.com/doitsujin/dxvk/issues/3861
    */
   bool allow_oob_indirect_ubo_loads;

   /* DXVK and VKD3D-Proton use customBorderColorWithoutFormat and create
    * most D24S8 images with USAGE_SAMPLED; in that case we disable UBWC
    * for correctness. However, games don't use border colors with
    * depth-stencil images, so we elect to ignore this edge case and force
    * UBWC to be enabled.
    */
   bool disable_d24s8_border_color_workaround;
};
VK_DEFINE_HANDLE_CASTS(tu_instance, vk.base, VkInstance,
                       VK_OBJECT_TYPE_INSTANCE)

/* This struct defines the layout of the global_bo */
struct tu6_global
{
   /* clear/blit shaders */
   uint32_t shaders[TU_BLIT_SHADER_SIZE];

   uint32_t seqno_dummy;          /* dummy seqno for CP_EVENT_WRITE */
   uint32_t _pad0;
   volatile uint32_t vsc_draw_overflow;
   uint32_t _pad1;
   volatile uint32_t vsc_prim_overflow;
   uint32_t _pad2;
   uint64_t predicate;

   /* scratch space for VPC_SO[i].FLUSH_BASE_LO/HI, start on 32 byte boundary. */
   struct {
      uint32_t offset;
      uint32_t pad[7];
   } flush_base[4];

   alignas(16) uint32_t cs_indirect_xyz[12];

   uint32_t vsc_state[32];

   volatile uint32_t vtx_stats_query_not_running;

   /* To know when renderpass stats for autotune are valid */
   volatile uint32_t autotune_fence;

   /* For recycling command buffers for dynamic suspend/resume commands */
   volatile uint32_t dynamic_rendering_fence;

   volatile uint32_t dbg_one;
   volatile uint32_t dbg_gmem_total_loads;
   volatile uint32_t dbg_gmem_taken_loads;
   volatile uint32_t dbg_gmem_total_stores;
   volatile uint32_t dbg_gmem_taken_stores;

   /* Written from GPU */
   volatile uint32_t breadcrumb_gpu_sync_seqno;
   uint32_t _pad3;
   /* Written from CPU, acknowledges value written from GPU */
   volatile uint32_t breadcrumb_cpu_sync_seqno;
   uint32_t _pad4;

   volatile uint32_t userspace_fence;
   uint32_t _pad5;

   /* note: larger global bo will be used for customBorderColors */
   struct bcolor_entry bcolor_builtin[TU_BORDER_COLOR_BUILTIN], bcolor[];
};
#define gb_offset(member) offsetof(struct tu6_global, member)
#define global_iova(cmd, member) ((cmd)->device->global_bo->iova + gb_offset(member))
#define global_iova_arr(cmd, member, idx)                                    \
   (global_iova(cmd, member) + sizeof_field(struct tu6_global, member[0]) * (idx))
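
/* Illustrative usage of the helpers above, assuming `cmd` is a
 * struct tu_cmd_buffer * whose device owns global_bo:
 *
 *    uint64_t fence_iova = global_iova(cmd, autotune_fence);
 *    uint64_t flush_iova = global_iova_arr(cmd, flush_base, i);
 *
 * global_iova_arr() indexes an array member of tu6_global by adding
 * idx * sizeof(element) to that member's GPU address.
 */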

struct tu_pvtmem_bo {
      mtx_t mtx;
      struct tu_bo *bo;
      uint32_t per_fiber_size, per_sp_size;
};

struct tu_virtio_device;
struct tu_queue;

struct tu_device
{
   struct vk_device vk;
   struct tu_instance *instance;

   struct tu_queue *queues[TU_MAX_QUEUE_FAMILIES];
   int queue_count[TU_MAX_QUEUE_FAMILIES];

   struct tu_physical_device *physical_device;
   uint32_t device_idx;
   int fd;

   struct ir3_compiler *compiler;

   /* Backup in-memory cache to be used if the app doesn't provide one */
   struct vk_pipeline_cache *mem_cache;

   struct vk_meta_device meta;

   radix_sort_vk_t *radix_sort;
   mtx_t radix_sort_mutex;

   struct util_sparse_array accel_struct_ranges;

#define MIN_SCRATCH_BO_SIZE_LOG2 12 /* A page */

   /* Currently the kernel driver uses a 32-bit GPU address space, but it
    * should be impossible to go beyond 48 bits.
    */
   struct {
      struct tu_bo *bo;
      mtx_t construct_mtx;
      bool initialized;
   } scratch_bos[48 - MIN_SCRATCH_BO_SIZE_LOG2];
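
   /* A plausible bucketing scheme implied by the array bounds above (an
    * assumption, not necessarily how tu_get_scratch_bo() is implemented):
    *
    *    unsigned idx = MAX2(util_logbase2_ceil64(size),
    *                        MIN_SCRATCH_BO_SIZE_LOG2) -
    *                   MIN_SCRATCH_BO_SIZE_LOG2;
    *
    * i.e. slot i would hold a scratch BO of 2^(i + MIN_SCRATCH_BO_SIZE_LOG2)
    * bytes, up to the 48-bit address-space limit mentioned above.
    */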

   struct tu_pvtmem_bo fiber_pvtmem_bo, wave_pvtmem_bo;

   struct tu_bo *global_bo;
   struct tu6_global *global_bo_map;

   struct tu_bo *null_accel_struct_bo;

   uint32_t implicit_sync_bo_count;

   /* Device-global BO suballocator for reducing BO management overhead for
    * (read-only) pipeline state.  Synchronized by pipeline_mutex.
    */
   struct tu_suballocator pipeline_suballoc;
   mtx_t pipeline_mutex;

   /* Device-global BO suballocator for reducing BO management overhead for
    * small gmem/sysmem autotune result buffers.  Synchronized by
    * autotune_mutex.
    */
   struct tu_suballocator autotune_suballoc;
   mtx_t autotune_mutex;

   /* KGSL requires a small chunk of GPU mem to retrieve raw GPU time on
    * each submission.
    */
   struct tu_suballocator kgsl_profiling_suballoc;
   mtx_t kgsl_profiling_mutex;

   /* the blob seems to always use 8K factor and 128K param sizes, copy them */
#define TU_TESS_FACTOR_SIZE (8 * 1024)
#define TU_TESS_PARAM_SIZE (128 * 1024)
#define TU_TESS_BO_SIZE (TU_TESS_FACTOR_SIZE + TU_TESS_PARAM_SIZE)
   /* Lazily allocated, protected by the device mutex. */
   struct tu_bo *tess_bo;

   struct ir3_shader_variant *global_shader_variants[GLOBAL_SH_COUNT];
   struct ir3_shader *global_shaders[GLOBAL_SH_COUNT];
   uint64_t global_shader_va[GLOBAL_SH_COUNT];

   struct tu_shader *empty_tcs, *empty_tes, *empty_gs, *empty_fs, *empty_fs_fdm;

   uint32_t vsc_draw_strm_pitch;
   uint32_t vsc_prim_strm_pitch;
   BITSET_DECLARE(custom_border_color, TU_BORDER_COLOR_COUNT);
   mtx_t mutex;

   mtx_t vma_mutex;
   struct util_vma_heap vma;

   /* bo list for submits: */
   struct drm_msm_gem_submit_bo *submit_bo_list;
   /* map bo handles to bo list index: */
   uint32_t submit_bo_count, submit_bo_list_size;
   /* bo list for dumping: */
   struct util_dynarray dump_bo_list;
   mtx_t bo_mutex;
   /* protects imported BO creation/freeing */
   struct u_rwlock dma_bo_lock;

   /* Tracking of name -> size allocated for TU_DEBUG_BOS */
   struct hash_table *bo_sizes;

   /* This array holds all our 'struct tu_bo' allocations. We use this
    * so we can add a refcount to our BOs and check if a particular BO
    * was already allocated in this device using its GEM handle. This is
    * necessary to properly manage BO imports, because the kernel doesn't
    * refcount the underlying BO memory.
    *
    * Specifically, when self-importing (i.e. importing a BO into the same
    * device that created it), the kernel will give us the same BO handle
    * for both BOs and we must only free it once when both references are
    * freed. Otherwise, if we are not self-importing, we get two different BO
    * handles, and we want to free each one individually.
    *
    * The refcount is also useful for being able to maintain BOs across
    * VK object lifetimes, such as pipelines suballocating out of BOs
    * allocated on the device.
    */
   struct util_sparse_array bo_map;

   /* We cannot free a BO's VMA immediately when freeing the BO: the kernel
    * only truly frees the BO once it stops being busy. So we must free our
    * VMA only after the kernel has done so.
    */
   struct u_vector zombie_vmas;

   struct tu_cs sub_cs;

   /* Command streams to set pass index to a scratch reg */
   struct tu_cs_entry *perfcntrs_pass_cs_entries;

   struct tu_cs_entry cmdbuf_start_a725_quirk_entry;

   struct tu_cs_entry bin_preamble_entry;

   struct util_dynarray dynamic_rendering_pending;
   VkCommandPool dynamic_rendering_pool;
   uint32_t dynamic_rendering_fence;

   /* Condition variable for timeline semaphore to notify waiters when a
    * new submit is executed. */
   pthread_cond_t timeline_cond;
   pthread_mutex_t submit_mutex;

   struct tu_autotune autotune;

   struct breadcrumbs_context *breadcrumbs_ctx;

   struct tu_cs *dbg_cmdbuf_stomp_cs;
   struct tu_cs *dbg_renderpass_stomp_cs;

#ifdef TU_HAS_VIRTIO
   struct tu_virtio_device *vdev;
#endif

   uint32_t submit_count;

   /* Address space and global fault count for this local_fd with DRM backend */
   uint64_t fault_count;

   struct u_trace_context trace_context;

   #ifdef HAVE_PERFETTO
   struct tu_perfetto_state perfetto;
   #endif

   bool use_z24uint_s8uint;
   bool use_lrz;

   struct fd_rd_output rd_output;
};
VK_DEFINE_HANDLE_CASTS(tu_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE)

struct tu_device_memory
{
   struct vk_device_memory vk;

   struct tu_bo *bo;

   /* for dedicated allocations */
   struct tu_image *image;
};
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_device_memory, vk.base, VkDeviceMemory,
                               VK_OBJECT_TYPE_DEVICE_MEMORY)

struct tu_attachment_info
{
   struct tu_image_view *attachment;
};

struct tu_tiling_config {
   /* size of the first tile */
   VkExtent2D tile0;
   /* number of tiles */
   VkExtent2D tile_count;

   /* size of the first VSC pipe */
   VkExtent2D pipe0;
   /* number of VSC pipes */
   VkExtent2D pipe_count;

   /* Whether using GMEM is even possible with this configuration */
   bool possible;

   /* Whether binning should be used for gmem rendering using this framebuffer. */
   bool binning;

   /* Whether binning could be used for gmem rendering using this framebuffer. */
   bool binning_possible;

   /* pipe register values */
   uint32_t pipe_config[MAX_VSC_PIPES];
   uint32_t pipe_sizes[MAX_VSC_PIPES];
};

struct tu_framebuffer
{
   struct vk_object_base base;

   uint32_t width;
   uint32_t height;
   uint32_t layers;

   struct tu_tiling_config tiling[TU_GMEM_LAYOUT_COUNT];

   uint32_t attachment_count;
   struct tu_attachment_info attachments[0];
};
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_framebuffer, base, VkFramebuffer,
                               VK_OBJECT_TYPE_FRAMEBUFFER)

uint64_t
tu_get_system_heap_size(struct tu_physical_device *physical_device);

VkResult
tu_physical_device_init(struct tu_physical_device *device,
                        struct tu_instance *instance);

void
tu_physical_device_get_global_priority_properties(const struct tu_physical_device *pdevice,
                                                  VkQueueFamilyGlobalPriorityPropertiesKHR *props);

uint64_t
tu_device_ticks_to_ns(struct tu_device *dev, uint64_t ts);

static inline struct tu_bo *
tu_device_lookup_bo(struct tu_device *device, uint32_t handle)
{
   return (struct tu_bo *) util_sparse_array_get(&device->bo_map, handle);
}

struct u_trace_context *
tu_device_get_u_trace(struct tu_device *device);

/* Get a scratch bo for use inside a command buffer. This will always return
 * the same bo given the same size or similar sizes, so only one scratch bo
 * can be used at the same time. It's meant for short-lived things where we
 * need to write to some piece of memory, read from it, and then immediately
 * discard it.
 */
VkResult
tu_get_scratch_bo(struct tu_device *dev, uint64_t size, struct tu_bo **bo);
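
/* Illustrative usage, following the comment above (a sketch, not taken from
 * the driver):
 *
 *    struct tu_bo *scratch;
 *    VkResult result = tu_get_scratch_bo(dev, 4096, &scratch);
 *    if (result != VK_SUCCESS)
 *       return result;
 *    ... emit commands that write to and then read back from `scratch`,
 *    without holding onto it once another scratch user may run ...
 */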

void tu_setup_dynamic_framebuffer(struct tu_cmd_buffer *cmd_buffer,
                                  const VkRenderingInfo *pRenderingInfo);

void
tu_copy_buffer(struct u_trace_context *utctx, void *cmdstream,
               void *ts_from, uint64_t from_offset_B,
               void *ts_to, uint64_t to_offset_B,
               uint64_t size_B);


VkResult
tu_create_copy_timestamp_cs(struct tu_cmd_buffer *cmdbuf, struct tu_cs** cs,
                            struct u_trace **trace_copy);

/* If we copy trace and timestamps we will have to free them. */
struct tu_u_trace_cmd_data
{
   struct tu_cs *timestamp_copy_cs;
   struct u_trace *trace;
};

/* Data necessary to retrieve timestamps and clean up all
 * associated resources afterwards.
 */
struct tu_u_trace_submission_data
{
   uint32_t submission_id;

   /* We have to know when timestamps are available;
    * this queue and fence indicate that.
    */
   struct tu_queue *queue;
   uint32_t fence;

   uint32_t cmd_buffer_count;
   uint32_t last_buffer_with_tracepoints;
   struct tu_u_trace_cmd_data *cmd_trace_data;

   /* GPU time is reset on GPU power cycles, so the GPU time
    * offset may change between submissions.
    */
   uint64_t gpu_ts_offset;

   /* KGSL needs GPU memory to write submission timestamps into */
   struct tu_suballoc_bo kgsl_timestamp_bo;
};

VkResult
tu_u_trace_submission_data_create(
   struct tu_device *device,
   struct tu_cmd_buffer **cmd_buffers,
   uint32_t cmd_buffer_count,
   struct tu_u_trace_submission_data **submission_data);

void
tu_u_trace_submission_data_finish(
   struct tu_device *device,
   struct tu_u_trace_submission_data *submission_data);

const char *
tu_debug_bos_add(struct tu_device *dev, uint64_t size, const char *name);
void
tu_debug_bos_del(struct tu_device *dev, struct tu_bo *bo);
void
tu_debug_bos_print_stats(struct tu_device *dev);

void
tu_dump_bo_init(struct tu_device *dev, struct tu_bo *bo);
void
tu_dump_bo_del(struct tu_device *dev, struct tu_bo *bo);


#endif /* TU_DEVICE_H */