/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 * SPDX-License-Identifier: MIT
 *
 * based in part on anv driver which is:
 * Copyright © 2015 Intel Corporation
 */

#ifndef TU_DEVICE_H
#define TU_DEVICE_H

#include "tu_common.h"

#include "vk_buffer.h"

#include "tu_autotune.h"
#include "tu_pass.h"
#include "tu_perfetto.h"
#include "tu_suballoc.h"
#include "tu_util.h"

#include "common/freedreno_rd_output.h"
#include "util/vma.h"
#include "util/u_vector.h"

/* queue types */
#define TU_QUEUE_GENERAL 0

#define TU_MAX_QUEUE_FAMILIES 1

#define TU_BORDER_COLOR_COUNT 4096
#define TU_BORDER_COLOR_BUILTIN 6

#define TU_BLIT_SHADER_SIZE 4096

/* extra space in vsc draw/prim streams */
#define VSC_PAD 0x40

enum global_shader {
   GLOBAL_SH_VS_BLIT,
   GLOBAL_SH_VS_CLEAR,
   GLOBAL_SH_FS_BLIT,
   GLOBAL_SH_FS_BLIT_ZSCALE,
   GLOBAL_SH_FS_COPY_MS,
   GLOBAL_SH_FS_COPY_MS_HALF,
   GLOBAL_SH_FS_CLEAR0,
   GLOBAL_SH_FS_CLEAR_MAX = GLOBAL_SH_FS_CLEAR0 + MAX_RTS,
   GLOBAL_SH_COUNT,
};

struct tu_memory_heap {
   /* Standard bits passed on to the client */
   VkDeviceSize      size;
   VkMemoryHeapFlags flags;

   /** Copied from ANV:
    *
    * Driver-internal book-keeping.
    *
    * Align it to 64 bits to make atomic operations faster on 32 bit platforms.
    */
   alignas(8) VkDeviceSize used;
};
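
/* Illustrative only (not taken from the driver): with `used` aligned to
 * 8 bytes as above, heap accounting can use the lock-free helpers from
 * util/u_atomic.h even on 32-bit builds, e.g.
 *
 *    p_atomic_add(&heap->used, mem_size);   // on allocation
 *    p_atomic_add(&heap->used, -mem_size);  // on free
 *
 * where `mem_size` is a placeholder for the allocation size.
 */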

struct tu_physical_device
{
   struct vk_physical_device vk;

   struct tu_instance *instance;

   const char *name;
   uint8_t driver_uuid[VK_UUID_SIZE];
   uint8_t device_uuid[VK_UUID_SIZE];
   uint8_t cache_uuid[VK_UUID_SIZE];

   struct wsi_device wsi_device;

   char fd_path[20];
   int local_fd;
   bool has_local;
   int64_t local_major;
   int64_t local_minor;
   int master_fd;
   bool has_master;
   int64_t master_major;
   int64_t master_minor;

   uint32_t gmem_size;
   uint64_t gmem_base;

   uint32_t usable_gmem_size_gmem;
   uint32_t ccu_offset_gmem;
   uint32_t ccu_offset_bypass;
   uint32_t ccu_depth_offset_bypass;
   uint32_t vpc_attr_buf_offset_gmem;
   uint32_t vpc_attr_buf_size_gmem;
   uint32_t vpc_attr_buf_offset_bypass;
   uint32_t vpc_attr_buf_size_bypass;

   /* Number of usable descriptor sets; this excludes any reserved set */
   uint32_t usable_sets;
   /* Index of the reserved descriptor set; -1 if none is reserved */
   int32_t reserved_set_idx;

   bool has_set_iova;
   uint64_t va_start;
   uint64_t va_size;

   bool has_cached_coherent_memory;
   bool has_cached_non_coherent_memory;
   uintptr_t level1_dcache_size;

   struct {
      uint32_t type_count;
      VkMemoryPropertyFlags types[VK_MAX_MEMORY_TYPES];
   } memory;

   struct fd_dev_id dev_id;
   struct fd_dev_info dev_info;
   const struct fd_dev_info *info;

   int msm_major_version;
   int msm_minor_version;

   /* with 0 being the highest priority */
   uint32_t submitqueue_priority_count;

   struct tu_memory_heap heap;

   struct vk_sync_type syncobj_type;
   struct vk_sync_timeline_type timeline_type;
   const struct vk_sync_type *sync_types[3];

   uint32_t device_count;
};
VK_DEFINE_HANDLE_CASTS(tu_physical_device, vk.base, VkPhysicalDevice,
                       VK_OBJECT_TYPE_PHYSICAL_DEVICE)

struct tu_knl;

struct tu_instance
{
   struct vk_instance vk;

   const struct tu_knl *knl;

   uint32_t api_version;

   struct driOptionCache dri_options;
   struct driOptionCache available_dri_options;

   bool dont_care_as_load;

   /* Conservative LRZ (default true) invalidates LRZ on draws with
    * blend and depth-write enabled, because this can lead to incorrect
    * rendering.  Driconf can be used to disable conservative LRZ for
    * games which do not have the problematic sequence of draws *and*
    * suffer a performance loss with conservative LRZ.
    */
   bool conservative_lrz;

   /* Whether to internally reserve a descriptor set for descriptor set
    * dynamic offsets. Not reserving it frees up a descriptor set, at the
    * cost of being unable to use the feature. As the feature is part of
    * core Vulkan, this is enabled by default.
    */
   bool reserve_descriptor_set;

   /* Allow out-of-bounds UBO access by disabling the lowering of UBO loads
    * with indirect access, which relies on the UBO bounds specified in the
    * shader rather than the bound UBO size, which isn't known until draw
    * time.
    *
    * See: https://github.com/doitsujin/dxvk/issues/3861
    */
   bool allow_oob_indirect_ubo_loads;
};
VK_DEFINE_HANDLE_CASTS(tu_instance, vk.base, VkInstance,
                       VK_OBJECT_TYPE_INSTANCE)

struct tu_queue
{
   struct vk_queue vk;

   struct tu_device *device;

   uint32_t msm_queue_id;
   uint32_t priority;

   int fence;           /* timestamp/fence of the last queue submission */
};
VK_DEFINE_HANDLE_CASTS(tu_queue, vk.base, VkQueue, VK_OBJECT_TYPE_QUEUE)

/* This struct defines the layout of the global_bo */
struct tu6_global
{
   /* clear/blit shaders */
   uint32_t shaders[TU_BLIT_SHADER_SIZE];

   uint32_t seqno_dummy;          /* dummy seqno for CP_EVENT_WRITE */
   uint32_t _pad0;
   volatile uint32_t vsc_draw_overflow;
   uint32_t _pad1;
   volatile uint32_t vsc_prim_overflow;
   uint32_t _pad2;
   uint64_t predicate;

   /* scratch space for VPC_SO[i].FLUSH_BASE_LO/HI, start on 32 byte boundary. */
   struct {
      uint32_t offset;
      uint32_t pad[7];
   } flush_base[4];

   alignas(16) uint32_t cs_indirect_xyz[12];

   volatile uint32_t vtx_stats_query_not_running;

   /* To know when renderpass stats for autotune are valid */
   volatile uint32_t autotune_fence;

   /* For recycling command buffers for dynamic suspend/resume commands */
   volatile uint32_t dynamic_rendering_fence;

   volatile uint32_t dbg_one;
   volatile uint32_t dbg_gmem_total_loads;
   volatile uint32_t dbg_gmem_taken_loads;
   volatile uint32_t dbg_gmem_total_stores;
   volatile uint32_t dbg_gmem_taken_stores;

   /* Written from GPU */
   volatile uint32_t breadcrumb_gpu_sync_seqno;
   uint32_t _pad3;
   /* Written from CPU, acknowledges value written from GPU */
   volatile uint32_t breadcrumb_cpu_sync_seqno;
   uint32_t _pad4;

   volatile uint32_t userspace_fence;
   uint32_t _pad5;

   /* note: larger global bo will be used for customBorderColors */
   struct bcolor_entry bcolor_builtin[TU_BORDER_COLOR_BUILTIN], bcolor[];
};
#define gb_offset(member) offsetof(struct tu6_global, member)
#define global_iova(cmd, member) ((cmd)->device->global_bo->iova + gb_offset(member))
#define global_iova_arr(cmd, member, idx)                                    \
   (global_iova(cmd, member) + sizeof_field(struct tu6_global, member[0]) * (idx))
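
/* Illustrative usage of the helpers above (sketch, not taken verbatim from
 * the driver): given a command buffer `cmd`, the GPU address of a member of
 * struct tu6_global inside the global BO is the BO's base iova plus the
 * member's offset within the struct, e.g.
 *
 *    uint64_t draw_overflow_iova = global_iova(cmd, vsc_draw_overflow);
 *    uint64_t flush_base0_iova   = global_iova_arr(cmd, flush_base, 0);
 */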

struct tu_pvtmem_bo {
      mtx_t mtx;
      struct tu_bo *bo;
      uint32_t per_fiber_size, per_sp_size;
};

#if DETECT_OS_ANDROID
enum tu_gralloc_type
{
   TU_GRALLOC_UNKNOWN,
   TU_GRALLOC_CROS,
   TU_GRALLOC_OTHER,
};
#endif

struct tu_virtio_device;

struct tu_device
{
   struct vk_device vk;
   struct tu_instance *instance;

   struct tu_queue *queues[TU_MAX_QUEUE_FAMILIES];
   int queue_count[TU_MAX_QUEUE_FAMILIES];

   struct tu_physical_device *physical_device;
   uint32_t device_idx;
   int fd;

   struct ir3_compiler *compiler;

   /* Backup in-memory cache to be used if the app doesn't provide one */
   struct vk_pipeline_cache *mem_cache;

#define MIN_SCRATCH_BO_SIZE_LOG2 12 /* A page */

   /* Currently the kernel driver uses a 32-bit GPU address space, but it
    * should be impossible to go beyond 48 bits.
    */
   struct {
      struct tu_bo *bo;
      mtx_t construct_mtx;
      bool initialized;
   } scratch_bos[48 - MIN_SCRATCH_BO_SIZE_LOG2];
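
   /* Illustrative sketch (an assumption, not copied from the driver): each
    * scratch_bos[] slot acts as a power-of-two size bucket, so a request of
    * `size` bytes would map to an index along the lines of
    *
    *    unsigned log2 = MAX2(util_logbase2_ceil64(size), MIN_SCRATCH_BO_SIZE_LOG2);
    *    unsigned index = log2 - MIN_SCRATCH_BO_SIZE_LOG2;
    *
    * which is consistent with nearby sizes sharing the same scratch BO (see
    * tu_get_scratch_bo() below).
    */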

   struct tu_pvtmem_bo fiber_pvtmem_bo, wave_pvtmem_bo;

   struct tu_bo *global_bo;
   struct tu6_global *global_bo_map;

   uint32_t implicit_sync_bo_count;

   /* Device-global BO suballocator for reducing BO management overhead for
    * (read-only) pipeline state.  Synchronized by pipeline_mutex.
    */
   struct tu_suballocator pipeline_suballoc;
   mtx_t pipeline_mutex;

   /* Device-global BO suballocator for reducing BO management for small
    * gmem/sysmem autotune result buffers.  Synchronized by autotune_mutex.
    */
   struct tu_suballocator autotune_suballoc;
   mtx_t autotune_mutex;

   /* KGSL requires a small chunk of GPU mem to retrieve raw GPU time on
    * each submission.
    */
   struct tu_suballocator kgsl_profiling_suballoc;
   mtx_t kgsl_profiling_mutex;

   /* the blob seems to always use 8K factor and 128K param sizes, copy them */
#define TU_TESS_FACTOR_SIZE (8 * 1024)
#define TU_TESS_PARAM_SIZE (128 * 1024)
#define TU_TESS_BO_SIZE (TU_TESS_FACTOR_SIZE + TU_TESS_PARAM_SIZE)
   /* Lazily allocated, protected by the device mutex. */
   struct tu_bo *tess_bo;

   struct ir3_shader_variant *global_shader_variants[GLOBAL_SH_COUNT];
   struct ir3_shader *global_shaders[GLOBAL_SH_COUNT];
   uint64_t global_shader_va[GLOBAL_SH_COUNT];

   struct tu_shader *empty_tcs, *empty_tes, *empty_gs, *empty_fs, *empty_fs_fdm;

   uint32_t vsc_draw_strm_pitch;
   uint32_t vsc_prim_strm_pitch;
   BITSET_DECLARE(custom_border_color, TU_BORDER_COLOR_COUNT);
   mtx_t mutex;

   mtx_t vma_mutex;
   struct util_vma_heap vma;

   /* bo list for submits: */
   struct drm_msm_gem_submit_bo *bo_list;
   /* map bo handles to bo list index: */
   uint32_t bo_count, bo_list_size;
   mtx_t bo_mutex;
   /* protects imported BOs creation/freeing */
   struct u_rwlock dma_bo_lock;

   /* Tracking of name -> size allocated for TU_DEBUG_BOS */
   struct hash_table *bo_sizes;

   /* This array holds all our 'struct tu_bo' allocations. We use this
    * so we can add a refcount to our BOs and check if a particular BO
    * was already allocated in this device using its GEM handle. This is
    * necessary to properly manage BO imports, because the kernel doesn't
    * refcount the underlying BO memory.
    *
    * Specifically, when self-importing (i.e. importing a BO into the same
    * device that created it), the kernel will give us the same BO handle
    * for both BOs and we must only free it once when both references are
    * freed. Otherwise, if we are not self-importing, we get two different BO
    * handles, and we want to free each one individually.
    *
    * The refcount is also useful for being able to maintain BOs across
    * VK object lifetimes, such as pipelines suballocating out of BOs
    * allocated on the device.
    */
   struct util_sparse_array bo_map;

   /* We cannot free a BO's VMA range immediately when freeing the BO,
    * because the kernel only truly frees the BO once it stops being busy.
    * So we free our VMA only after the kernel has freed the BO.
    */
   struct u_vector zombie_vmas;

   /* Command streams to set pass index to a scratch reg */
   struct tu_cs *perfcntrs_pass_cs;
   struct tu_cs_entry *perfcntrs_pass_cs_entries;

   struct tu_cs *cmdbuf_start_a725_quirk_cs;
   struct tu_cs_entry *cmdbuf_start_a725_quirk_entry;

   struct util_dynarray dynamic_rendering_pending;
   VkCommandPool dynamic_rendering_pool;
   uint32_t dynamic_rendering_fence;

   /* Condition variable for timeline semaphore to notify waiters when a
    * new submit is executed. */
   pthread_cond_t timeline_cond;
   pthread_mutex_t submit_mutex;

   struct tu_autotune autotune;

   struct breadcrumbs_context *breadcrumbs_ctx;

   struct tu_cs *dbg_cmdbuf_stomp_cs;
   struct tu_cs *dbg_renderpass_stomp_cs;

#if DETECT_OS_ANDROID
   const void *gralloc;
   enum tu_gralloc_type gralloc_type;
#endif

#ifdef TU_HAS_VIRTIO
   struct tu_virtio_device *vdev;
#endif

   uint32_t submit_count;

   /* Address space and global fault count for this local_fd with DRM backend */
   uint64_t fault_count;

   struct u_trace_context trace_context;

   #ifdef HAVE_PERFETTO
   struct tu_perfetto_state perfetto;
   #endif

   bool use_z24uint_s8uint;
   bool use_lrz;

   struct fd_rd_output rd_output;
};
VK_DEFINE_HANDLE_CASTS(tu_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE)

struct tu_device_memory
{
   struct vk_object_base base;

   struct tu_bo *bo;

   /* for dedicated allocations */
   struct tu_image *image;
};
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_device_memory, base, VkDeviceMemory,
                               VK_OBJECT_TYPE_DEVICE_MEMORY)

struct tu_buffer
{
   struct vk_buffer vk;

   struct tu_bo *bo;
   uint64_t iova;
};
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_buffer, vk.base, VkBuffer,
                               VK_OBJECT_TYPE_BUFFER)

struct tu_attachment_info
{
   struct tu_image_view *attachment;
};

struct tu_tiling_config {
   /* size of the first tile */
   VkExtent2D tile0;
   /* number of tiles */
   VkExtent2D tile_count;

   /* size of the first VSC pipe */
   VkExtent2D pipe0;
   /* number of VSC pipes */
   VkExtent2D pipe_count;

   /* Whether using GMEM is even possible with this configuration */
   bool possible;

   /* Whether binning should be used for gmem rendering using this framebuffer. */
   bool binning;

   /* Whether binning could be used for gmem rendering using this framebuffer. */
   bool binning_possible;

   /* pipe register values */
   uint32_t pipe_config[MAX_VSC_PIPES];
   uint32_t pipe_sizes[MAX_VSC_PIPES];
};

struct tu_framebuffer
{
   struct vk_object_base base;

   uint32_t width;
   uint32_t height;
   uint32_t layers;

   struct tu_tiling_config tiling[TU_GMEM_LAYOUT_COUNT];

   uint32_t attachment_count;
   struct tu_attachment_info attachments[0];
};
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_framebuffer, base, VkFramebuffer,
                               VK_OBJECT_TYPE_FRAMEBUFFER)

struct tu_event
{
   struct vk_object_base base;
   struct tu_bo *bo;
};
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_event, base, VkEvent, VK_OBJECT_TYPE_EVENT)

struct tu_sampler {
   struct vk_object_base base;

   uint32_t descriptor[A6XX_TEX_SAMP_DWORDS];
   struct tu_sampler_ycbcr_conversion *ycbcr_sampler;
};
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_sampler, base, VkSampler,
                               VK_OBJECT_TYPE_SAMPLER)

uint64_t
tu_get_system_heap_size(struct tu_physical_device *physical_device);

VkResult
tu_physical_device_init(struct tu_physical_device *device,
                        struct tu_instance *instance);

uint64_t
tu_device_ticks_to_ns(struct tu_device *dev, uint64_t ts);

static inline struct tu_bo *
tu_device_lookup_bo(struct tu_device *device, uint32_t handle)
{
   return (struct tu_bo *) util_sparse_array_get(&device->bo_map, handle);
}

struct u_trace_context *
tu_device_get_u_trace(struct tu_device *device);

/* Get a scratch bo for use inside a command buffer. This will always return
 * the same bo given the same size or similar sizes, so only one scratch bo
 * can be used at the same time. It's meant for short-lived things where we
 * need to write to some piece of memory, read from it, and then immediately
 * discard it.
 */
VkResult
tu_get_scratch_bo(struct tu_device *dev, uint64_t size, struct tu_bo **bo);
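
/* Illustrative call pattern (sketch only, error handling elided; the size is
 * just an example value):
 *
 *    struct tu_bo *scratch;
 *    VkResult result = tu_get_scratch_bo(dev, 4096, &scratch);
 *    if (result == VK_SUCCESS) {
 *       // emit commands that write to / read from scratch->iova, then stop
 *       // referencing the BO once that work has completed
 *    }
 *
 * The scratch BOs appear to be owned and reused by the device (see
 * scratch_bos above), so the caller does not free the returned BO.
 */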

void tu_setup_dynamic_framebuffer(struct tu_cmd_buffer *cmd_buffer,
                                  const VkRenderingInfo *pRenderingInfo);

void
tu_copy_timestamp_buffer(struct u_trace_context *utctx, void *cmdstream,
                         void *ts_from, uint32_t from_offset,
                         void *ts_to, uint32_t to_offset,
                         uint32_t count);


VkResult
tu_create_copy_timestamp_cs(struct tu_cmd_buffer *cmdbuf, struct tu_cs** cs,
                            struct u_trace **trace_copy);

/* If we copy trace and timestamps we will have to free them. */
struct tu_u_trace_cmd_data
{
   struct tu_cs *timestamp_copy_cs;
   struct u_trace *trace;
};

/* Data necessary to retrieve timestamps and clean all
 * associated resources afterwards.
 */
struct tu_u_trace_submission_data
{
   uint32_t submission_id;
   /* We have to know when timestamps are available;
    * this sync object indicates it.
    */
   struct tu_u_trace_syncobj *syncobj;

   uint32_t cmd_buffer_count;
   uint32_t last_buffer_with_tracepoints;
   struct tu_u_trace_cmd_data *cmd_trace_data;

   /* GPU time is reset on GPU power cycle and the GPU time
    * offset may change between submissions due to power cycle.
    */
   uint64_t gpu_ts_offset;

   /* KGSL needs GPU memory to write submission timestamps into */
   struct tu_suballoc_bo kgsl_timestamp_bo;
};

VkResult
tu_u_trace_submission_data_create(
   struct tu_device *device,
   struct tu_cmd_buffer **cmd_buffers,
   uint32_t cmd_buffer_count,
   struct tu_u_trace_submission_data **submission_data);

void
tu_u_trace_submission_data_finish(
   struct tu_device *device,
   struct tu_u_trace_submission_data *submission_data);

const char *
tu_debug_bos_add(struct tu_device *dev, uint64_t size, const char *name);
void
tu_debug_bos_del(struct tu_device *dev, struct tu_bo *bo);
void
tu_debug_bos_print_stats(struct tu_device *dev);

#endif /* TU_DEVICE_H */