/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 * SPDX-License-Identifier: MIT
 *
 * based in part on anv driver which is:
 * Copyright © 2015 Intel Corporation
 */

#ifndef TU_DEVICE_H
#define TU_DEVICE_H

#include "tu_common.h"

#include "vk_device_memory.h"

#include "tu_autotune.h"
#include "tu_cs.h"
#include "tu_pass.h"
#include "tu_perfetto.h"
#include "tu_suballoc.h"
#include "tu_util.h"

#include "common/freedreno_rd_output.h"
#include "util/vma.h"
#include "util/u_vector.h"

/* queue types */
#define TU_QUEUE_GENERAL 0

#define TU_MAX_QUEUE_FAMILIES 1

#define TU_BORDER_COLOR_COUNT 4096
#define TU_BORDER_COLOR_BUILTIN 6

#define TU_BLIT_SHADER_SIZE 4096

/* extra space in vsc draw/prim streams */
#define VSC_PAD 0x40

enum global_shader {
   GLOBAL_SH_VS_BLIT,
   GLOBAL_SH_VS_CLEAR,
   GLOBAL_SH_FS_BLIT,
   GLOBAL_SH_FS_BLIT_ZSCALE,
   GLOBAL_SH_FS_COPY_MS,
   GLOBAL_SH_FS_COPY_MS_HALF,
   GLOBAL_SH_FS_CLEAR0,
   GLOBAL_SH_FS_CLEAR_MAX = GLOBAL_SH_FS_CLEAR0 + MAX_RTS,
   GLOBAL_SH_COUNT,
};

struct tu_memory_heap {
   /* Standard bits passed on to the client */
   VkDeviceSize      size;
   VkMemoryHeapFlags flags;

   /** Copied from ANV:
    *
    * Driver-internal book-keeping.
    *
    * Align it to 64 bits to make atomic operations faster on 32 bit platforms.
    */
   alignas(8) VkDeviceSize used;
};

enum tu_kgsl_dma_type
{
   TU_KGSL_DMA_TYPE_ION_LEGACY,
   TU_KGSL_DMA_TYPE_ION,
   TU_KGSL_DMA_TYPE_DMAHEAP,
};

extern uint64_t os_page_size;

struct tu_physical_device
{
   struct vk_physical_device vk;

   struct tu_instance *instance;

   const char *name;
   uint8_t driver_uuid[VK_UUID_SIZE];
   uint8_t device_uuid[VK_UUID_SIZE];
   uint8_t cache_uuid[VK_UUID_SIZE];

   struct wsi_device wsi_device;

   char fd_path[20];
   int local_fd;
   bool has_local;
   int64_t local_major;
   int64_t local_minor;
   int master_fd;
   bool has_master;
   int64_t master_major;
   int64_t master_minor;

   int kgsl_dma_fd;
   enum tu_kgsl_dma_type kgsl_dma_type;

   uint32_t gmem_size;
   uint64_t gmem_base;

   uint32_t usable_gmem_size_gmem;
   uint32_t ccu_offset_gmem;
   uint32_t ccu_offset_bypass;
   uint32_t ccu_depth_offset_bypass;
   uint32_t vpc_attr_buf_offset_gmem;
   uint32_t vpc_attr_buf_size_gmem;
   uint32_t vpc_attr_buf_offset_bypass;
   uint32_t vpc_attr_buf_size_bypass;

   /* Number of usable descriptor sets; this excludes any reserved set */
   uint32_t usable_sets;
   /* Index of the reserved descriptor set; may be -1 if unset */
   int32_t reserved_set_idx;

   bool has_set_iova;
   uint64_t va_start;
   uint64_t va_size;

   bool has_cached_coherent_memory;
   bool has_cached_non_coherent_memory;
   uintptr_t level1_dcache_size;

   struct fdl_ubwc_config ubwc_config;

   bool has_preemption;

   struct {
      uint32_t type_count;
      VkMemoryPropertyFlags types[VK_MAX_MEMORY_TYPES];
   } memory;

   struct fd_dev_id dev_id;
   struct fd_dev_info dev_info;
   const struct fd_dev_info *info;

   int msm_major_version;
   int msm_minor_version;

   /* with 0 being the highest priority */
   uint32_t submitqueue_priority_count;

   struct tu_memory_heap heap;

   struct vk_sync_type syncobj_type;
   struct vk_sync_timeline_type timeline_type;
   const struct vk_sync_type *sync_types[3];

   uint32_t device_count;
};
VK_DEFINE_HANDLE_CASTS(tu_physical_device, vk.base, VkPhysicalDevice,
                       VK_OBJECT_TYPE_PHYSICAL_DEVICE)

struct tu_knl;

struct tu_instance
{
   struct vk_instance vk;

   const struct tu_knl *knl;

   uint32_t instance_idx;
   uint32_t api_version;

   struct driOptionCache dri_options;
   struct driOptionCache available_dri_options;

   bool dont_care_as_load;

   /* Conservative LRZ (default true) invalidates LRZ on draws with
    * blend and depth-write enabled, because this can lead to incorrect
    * rendering.  Driconf can be used to disable conservative LRZ for
    * games which do not have the problematic sequence of draws *and*
    * suffer a performance loss with conservative LRZ.
    */
   bool conservative_lrz;

   /* Whether to internally reserve a descriptor set for descriptor set
    * dynamic offsets. Disabling this frees up one more descriptor set for
    * the application at the cost of being unable to use the feature. As it
    * is part of core Vulkan, this is enabled by default.
    */
   bool reserve_descriptor_set;

   /* Allow out-of-bounds UBO access by disabling the lowering of indirect
    * UBO loads. The lowering relies on the UBO bounds specified in the
    * shader rather than the bound UBO size, which isn't known until draw
    * time.
    *
    * See: https://github.com/doitsujin/dxvk/issues/3861
    */
   bool allow_oob_indirect_ubo_loads;

   /* DXVK and VKD3D-Proton use customBorderColorWithoutFormat and create
    * most of their D24S8 images with USAGE_SAMPLED, in which case we would
    * disable UBWC for correctness. However, games don't use border colors
    * with depth-stencil images, so we elect to ignore this edge case and
    * keep UBWC enabled.
    */
   bool disable_d24s8_border_color_workaround;
};
VK_DEFINE_HANDLE_CASTS(tu_instance, vk.base, VkInstance,
                       VK_OBJECT_TYPE_INSTANCE)

/* This struct defines the layout of the global_bo */
struct tu6_global
{
   /* clear/blit shaders */
   uint32_t shaders[TU_BLIT_SHADER_SIZE];

   uint32_t seqno_dummy;          /* dummy seqno for CP_EVENT_WRITE */
   uint32_t _pad0;
   volatile uint32_t vsc_draw_overflow;
   uint32_t _pad1;
   volatile uint32_t vsc_prim_overflow;
   uint32_t _pad2;
   uint64_t predicate;

   /* scratch space for VPC_SO[i].FLUSH_BASE_LO/HI, start on 32 byte boundary. */
   struct {
      uint32_t offset;
      uint32_t pad[7];
   } flush_base[4];

   alignas(16) uint32_t cs_indirect_xyz[12];

   uint32_t vsc_state[32];

   volatile uint32_t vtx_stats_query_not_running;

   /* To know when renderpass stats for autotune are valid */
   volatile uint32_t autotune_fence;

   /* For recycling command buffers for dynamic suspend/resume commands */
   volatile uint32_t dynamic_rendering_fence;

   volatile uint32_t dbg_one;
   volatile uint32_t dbg_gmem_total_loads;
   volatile uint32_t dbg_gmem_taken_loads;
   volatile uint32_t dbg_gmem_total_stores;
   volatile uint32_t dbg_gmem_taken_stores;

   /* Written from GPU */
   volatile uint32_t breadcrumb_gpu_sync_seqno;
   uint32_t _pad3;
   /* Written from CPU, acknowledges value written from GPU */
   volatile uint32_t breadcrumb_cpu_sync_seqno;
   uint32_t _pad4;

   volatile uint32_t userspace_fence;
   uint32_t _pad5;

   /* note: larger global bo will be used for customBorderColors */
   struct bcolor_entry bcolor_builtin[TU_BORDER_COLOR_BUILTIN], bcolor[];
};
#define gb_offset(member) offsetof(struct tu6_global, member)
#define global_iova(cmd, member) ((cmd)->device->global_bo->iova + gb_offset(member))
#define global_iova_arr(cmd, member, idx)                                    \
   (global_iova(cmd, member) + sizeof_field(struct tu6_global, member[0]) * (idx))
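
/* Illustrative usage of the helpers above (a sketch, not code taken from the
 * driver): computing the device address of a tu6_global member in the global
 * BO from a command buffer "cmd":
 *
 *    uint64_t overflow_iova   = global_iova(cmd, vsc_draw_overflow);
 *    uint64_t flush_base_iova = global_iova_arr(cmd, flush_base, i);
 *
 * Both expand to cmd->device->global_bo->iova plus the member's byte offset
 * within struct tu6_global.
 */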

struct tu_pvtmem_bo {
   mtx_t mtx;
   struct tu_bo *bo;
   uint32_t per_fiber_size, per_sp_size;
};

struct tu_virtio_device;
struct tu_queue;

struct tu_device
{
   struct vk_device vk;
   struct tu_instance *instance;

   struct tu_queue *queues[TU_MAX_QUEUE_FAMILIES];
   int queue_count[TU_MAX_QUEUE_FAMILIES];

   struct tu_physical_device *physical_device;
   uint32_t device_idx;
   int fd;

   struct ir3_compiler *compiler;

   /* Backup in-memory cache to be used if the app doesn't provide one */
   struct vk_pipeline_cache *mem_cache;

#define MIN_SCRATCH_BO_SIZE_LOG2 12 /* A page */

   /* Currently the kernel driver uses a 32-bit GPU address space, but it
    * should be impossible to go beyond 48 bits.
    */
   struct {
      struct tu_bo *bo;
      mtx_t construct_mtx;
      bool initialized;
   } scratch_bos[48 - MIN_SCRATCH_BO_SIZE_LOG2];

   struct tu_pvtmem_bo fiber_pvtmem_bo, wave_pvtmem_bo;

   struct tu_bo *global_bo;
   struct tu6_global *global_bo_map;

   uint32_t implicit_sync_bo_count;

   /* Device-global BO suballocator for reducing BO management overhead for
    * (read-only) pipeline state.  Synchronized by pipeline_mutex.
    */
   struct tu_suballocator pipeline_suballoc;
   mtx_t pipeline_mutex;

   /* Device-global BO suballocator for reducing BO management for small
    * gmem/sysmem autotune result buffers.  Synchronized by autotune_mutex.
    */
   struct tu_suballocator autotune_suballoc;
   mtx_t autotune_mutex;

   /* KGSL requires a small chunk of GPU mem to retrieve raw GPU time on
    * each submission.
    */
   struct tu_suballocator kgsl_profiling_suballoc;
   mtx_t kgsl_profiling_mutex;

   /* the blob seems to always use 8K factor and 128K param sizes, copy them */
#define TU_TESS_FACTOR_SIZE (8 * 1024)
#define TU_TESS_PARAM_SIZE (128 * 1024)
#define TU_TESS_BO_SIZE (TU_TESS_FACTOR_SIZE + TU_TESS_PARAM_SIZE)
   /* Lazily allocated, protected by the device mutex. */
   struct tu_bo *tess_bo;

   struct ir3_shader_variant *global_shader_variants[GLOBAL_SH_COUNT];
   struct ir3_shader *global_shaders[GLOBAL_SH_COUNT];
   uint64_t global_shader_va[GLOBAL_SH_COUNT];

   struct tu_shader *empty_tcs, *empty_tes, *empty_gs, *empty_fs, *empty_fs_fdm;

   uint32_t vsc_draw_strm_pitch;
   uint32_t vsc_prim_strm_pitch;
   BITSET_DECLARE(custom_border_color, TU_BORDER_COLOR_COUNT);
   mtx_t mutex;

   mtx_t vma_mutex;
   struct util_vma_heap vma;

   /* bo list for submits: */
   struct drm_msm_gem_submit_bo *submit_bo_list;
   /* map bo handles to bo list index: */
   uint32_t submit_bo_count, submit_bo_list_size;
   /* bo list for dumping: */
   struct util_dynarray dump_bo_list;
   mtx_t bo_mutex;
   /* protects imported BOs creation/freeing */
   struct u_rwlock dma_bo_lock;

   /* Tracking of name -> size allocated for TU_DEBUG_BOS */
   struct hash_table *bo_sizes;

   /* This array holds all our 'struct tu_bo' allocations. We use this
    * so we can add a refcount to our BOs and check if a particular BO
    * was already allocated in this device using its GEM handle. This is
    * necessary to properly manage BO imports, because the kernel doesn't
    * refcount the underlying BO memory.
    *
    * Specifically, when self-importing (i.e. importing a BO into the same
    * device that created it), the kernel will give us the same BO handle
    * for both BOs and we must only free it once when both references are
    * freed. Otherwise, if we are not self-importing, we get two different BO
    * handles, and we want to free each one individually.
    *
    * The refcount is also useful for being able to maintain BOs across
    * VK object lifetimes, such as pipelines suballocating out of BOs
    * allocated on the device.
    */
   struct util_sparse_array bo_map;

   /* We cannot immediately free the VMA when freeing a BO: the kernel only
    * truly frees the BO once it stops being busy, so we have to free our
    * VMA only after the kernel has done so.
    */
   struct u_vector zombie_vmas;

   struct tu_cs sub_cs;

   /* Command streams to set pass index to a scratch reg */
   struct tu_cs_entry *perfcntrs_pass_cs_entries;

   struct tu_cs_entry cmdbuf_start_a725_quirk_entry;

   struct tu_cs_entry bin_preamble_entry;

   struct util_dynarray dynamic_rendering_pending;
   VkCommandPool dynamic_rendering_pool;
   uint32_t dynamic_rendering_fence;

   /* Condition variable for timeline semaphore to notify waiters when a
    * new submit is executed. */
   pthread_cond_t timeline_cond;
   pthread_mutex_t submit_mutex;

   struct tu_autotune autotune;

   struct breadcrumbs_context *breadcrumbs_ctx;

   struct tu_cs *dbg_cmdbuf_stomp_cs;
   struct tu_cs *dbg_renderpass_stomp_cs;

#ifdef TU_HAS_VIRTIO
   struct tu_virtio_device *vdev;
#endif

   uint32_t submit_count;

   /* Address space and global fault count for this local_fd with DRM backend */
   uint64_t fault_count;

   struct u_trace_context trace_context;

#ifdef HAVE_PERFETTO
   struct tu_perfetto_state perfetto;
#endif

   bool use_z24uint_s8uint;
   bool use_lrz;

   struct fd_rd_output rd_output;
};
VK_DEFINE_HANDLE_CASTS(tu_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE)

struct tu_device_memory
{
   struct vk_device_memory vk;

   struct tu_bo *bo;

   /* for dedicated allocations */
   struct tu_image *image;
};
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_device_memory, vk.base, VkDeviceMemory,
                               VK_OBJECT_TYPE_DEVICE_MEMORY)

struct tu_attachment_info
{
   struct tu_image_view *attachment;
};

struct tu_tiling_config {
   /* size of the first tile */
   VkExtent2D tile0;
   /* number of tiles */
   VkExtent2D tile_count;

   /* size of the first VSC pipe */
   VkExtent2D pipe0;
   /* number of VSC pipes */
   VkExtent2D pipe_count;

   /* Whether using GMEM is even possible with this configuration */
   bool possible;

   /* Whether binning should be used for gmem rendering using this framebuffer. */
   bool binning;

   /* Whether binning could be used for gmem rendering using this framebuffer. */
   bool binning_possible;

   /* pipe register values */
   uint32_t pipe_config[MAX_VSC_PIPES];
   uint32_t pipe_sizes[MAX_VSC_PIPES];
};
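
/* Illustrative relationship between the fields above (an approximation, not
 * the driver's exact computation, which also accounts for GMEM capacity and
 * alignment constraints):
 *
 *    tile_count.width  ~= DIV_ROUND_UP(framebuffer width,  tile0.width)
 *    tile_count.height ~= DIV_ROUND_UP(framebuffer height, tile0.height)
 */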

struct tu_framebuffer
{
   struct vk_object_base base;

   uint32_t width;
   uint32_t height;
   uint32_t layers;

   struct tu_tiling_config tiling[TU_GMEM_LAYOUT_COUNT];

   uint32_t attachment_count;
   struct tu_attachment_info attachments[0];
};
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_framebuffer, base, VkFramebuffer,
                               VK_OBJECT_TYPE_FRAMEBUFFER)
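
/* tu_framebuffer stores its attachments inline via the flexible array above,
 * so the allocation size depends on the attachment count. An illustrative
 * sketch (not the driver's actual allocation code):
 *
 *    size_t size = sizeof(struct tu_framebuffer) +
 *                  attachment_count * sizeof(struct tu_attachment_info);
 */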

uint64_t
tu_get_system_heap_size(struct tu_physical_device *physical_device);

VkResult
tu_physical_device_init(struct tu_physical_device *device,
                        struct tu_instance *instance);

void
tu_physical_device_get_global_priority_properties(const struct tu_physical_device *pdevice,
                                                  VkQueueFamilyGlobalPriorityPropertiesKHR *props);

uint64_t
tu_device_ticks_to_ns(struct tu_device *dev, uint64_t ts);

static inline struct tu_bo *
tu_device_lookup_bo(struct tu_device *device, uint32_t handle)
{
   return (struct tu_bo *) util_sparse_array_get(&device->bo_map, handle);
}

struct u_trace_context *
tu_device_get_u_trace(struct tu_device *device);

/* Get a scratch bo for use inside a command buffer. This will always return
 * the same bo given the same size or similar sizes, so only one scratch bo
 * can be used at the same time. It's meant for short-lived things where we
 * need to write to some piece of memory, read from it, and then immediately
 * discard it.
 */
VkResult
tu_get_scratch_bo(struct tu_device *dev, uint64_t size, struct tu_bo **bo);
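
/* Illustrative call (a sketch; real callers also record the commands that
 * write to and read back from the BO):
 *
 *    struct tu_bo *scratch;
 *    VkResult result = tu_get_scratch_bo(dev, 4096, &scratch);
 *    if (result != VK_SUCCESS)
 *       return result;
 *
 * Since the same BO is shared between callers, the data at scratch->iova is
 * only valid until the next user of the scratch BO overwrites it.
 */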

void tu_setup_dynamic_framebuffer(struct tu_cmd_buffer *cmd_buffer,
                                  const VkRenderingInfo *pRenderingInfo);

void
tu_copy_buffer(struct u_trace_context *utctx, void *cmdstream,
               void *ts_from, uint64_t from_offset_B,
               void *ts_to, uint64_t to_offset_B,
               uint64_t size_B);


VkResult
tu_create_copy_timestamp_cs(struct tu_cmd_buffer *cmdbuf, struct tu_cs **cs,
                            struct u_trace **trace_copy);

/* If we copy trace and timestamps we will have to free them. */
struct tu_u_trace_cmd_data
{
   struct tu_cs *timestamp_copy_cs;
   struct u_trace *trace;
};

/* Data necessary to retrieve timestamps and clean up all
 * associated resources afterwards.
 */
struct tu_u_trace_submission_data
{
   uint32_t submission_id;

   /* We have to know when timestamps are available;
    * this queue and fence indicate it.
    */
   struct tu_queue *queue;
   uint32_t fence;

   uint32_t cmd_buffer_count;
   uint32_t last_buffer_with_tracepoints;
   struct tu_u_trace_cmd_data *cmd_trace_data;

   /* GPU time is reset on a GPU power cycle, so the GPU time offset may
    * change between submissions.
    */
   uint64_t gpu_ts_offset;

   /* KGSL needs GPU memory to write submission timestamps into */
   struct tu_suballoc_bo kgsl_timestamp_bo;
};

VkResult
tu_u_trace_submission_data_create(
   struct tu_device *device,
   struct tu_cmd_buffer **cmd_buffers,
   uint32_t cmd_buffer_count,
   struct tu_u_trace_submission_data **submission_data);

void
tu_u_trace_submission_data_finish(
   struct tu_device *device,
   struct tu_u_trace_submission_data *submission_data);

const char *
tu_debug_bos_add(struct tu_device *dev, uint64_t size, const char *name);
void
tu_debug_bos_del(struct tu_device *dev, struct tu_bo *bo);
void
tu_debug_bos_print_stats(struct tu_device *dev);

void
tu_dump_bo_init(struct tu_device *dev, struct tu_bo *bo);
void
tu_dump_bo_del(struct tu_device *dev, struct tu_bo *bo);


#endif /* TU_DEVICE_H */