/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 * SPDX-License-Identifier: MIT
 *
 * based in part on anv driver which is:
 * Copyright © 2015 Intel Corporation
 */

#ifndef TU_DEVICE_H
#define TU_DEVICE_H

#include "tu_common.h"

#include "tu_autotune.h"
#include "tu_pass.h"
#include "tu_perfetto.h"
#include "tu_suballoc.h"
#include "tu_util.h"

/* queue types */
#define TU_QUEUE_GENERAL 0

#define TU_MAX_QUEUE_FAMILIES 1

#define TU_BORDER_COLOR_COUNT 4096
#define TU_BORDER_COLOR_BUILTIN 6

#define TU_BLIT_SHADER_SIZE 1024

/* extra space in vsc draw/prim streams */
#define VSC_PAD 0x40

enum tu_debug_flags
{
   TU_DEBUG_STARTUP = 1 << 0,
   TU_DEBUG_NIR = 1 << 1,
   TU_DEBUG_NOBIN = 1 << 3,
   TU_DEBUG_SYSMEM = 1 << 4,
   TU_DEBUG_FORCEBIN = 1 << 5,
   TU_DEBUG_NOUBWC = 1 << 6,
   TU_DEBUG_NOMULTIPOS = 1 << 7,
   TU_DEBUG_NOLRZ = 1 << 8,
   TU_DEBUG_PERFC = 1 << 9,
   TU_DEBUG_FLUSHALL = 1 << 10,
   TU_DEBUG_SYNCDRAW = 1 << 11,
   TU_DEBUG_DONT_CARE_AS_LOAD = 1 << 12,
   TU_DEBUG_GMEM = 1 << 13,
   TU_DEBUG_RAST_ORDER = 1 << 14,
   TU_DEBUG_UNALIGNED_STORE = 1 << 15,
   TU_DEBUG_LAYOUT = 1 << 16,
   TU_DEBUG_LOG_SKIP_GMEM_OPS = 1 << 17,
   TU_DEBUG_PERF = 1 << 18,
   TU_DEBUG_NOLRZFC = 1 << 19,
   TU_DEBUG_DYNAMIC = 1 << 20,
};
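
/* Example (illustrative, not part of the driver): the debug flags form a
 * bitmask, so a code path is gated on one like this. This assumes
 * `instance->debug_flags` was populated at instance creation (in Mesa it
 * is typically parsed from a TU_DEBUG-style environment variable):
 *
 *    if (instance->debug_flags & TU_DEBUG_NOBIN)
 *       ...   // skip the binning pass for gmem rendering
 */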

enum global_shader {
   GLOBAL_SH_VS_BLIT,
   GLOBAL_SH_VS_CLEAR,
   GLOBAL_SH_FS_BLIT,
   GLOBAL_SH_FS_BLIT_ZSCALE,
   GLOBAL_SH_FS_COPY_MS,
   GLOBAL_SH_FS_CLEAR0,
   GLOBAL_SH_FS_CLEAR_MAX = GLOBAL_SH_FS_CLEAR0 + MAX_RTS,
   GLOBAL_SH_COUNT,
};
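
/* Note (inferred from the enum layout above): the fragment clear shaders
 * occupy a contiguous range of slots, so the variant that writes `n`
 * render targets is expected at index GLOBAL_SH_FS_CLEAR0 + n, bounded by
 * GLOBAL_SH_FS_CLEAR_MAX.
 */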

struct tu_memory_heap {
   /* Standard bits passed on to the client */
   VkDeviceSize      size;
   VkMemoryHeapFlags flags;

   /** Copied from ANV:
    *
    * Driver-internal book-keeping.
    *
    * Align it to 64 bits to make atomic operations faster on 32-bit
    * platforms.
    */
   VkDeviceSize      used __attribute__ ((aligned (8)));
};
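
/* A minimal sketch (not a driver function): the 8-byte alignment of `used`
 * above is what makes 64-bit atomic updates cheap on 32-bit platforms.
 * Accounting an allocation against the heap might look like this; the name
 * tu_example_heap_account is hypothetical, and real Mesa code would
 * typically use the p_atomic_* helpers from util/u_atomic.h instead of raw
 * builtins.
 */
static inline VkResult
tu_example_heap_account(struct tu_memory_heap *heap, VkDeviceSize size)
{
   /* Reserve the space first, then check whether the heap overflowed. */
   VkDeviceSize used =
      __atomic_add_fetch(&heap->used, size, __ATOMIC_RELAXED);
   if (used > heap->size) {
      /* Roll the reservation back and report the failure. */
      __atomic_sub_fetch(&heap->used, size, __ATOMIC_RELAXED);
      return VK_ERROR_OUT_OF_DEVICE_MEMORY;
   }
   return VK_SUCCESS;
}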

struct tu_physical_device
{
   struct vk_physical_device vk;

   struct tu_instance *instance;

   const char *name;
   uint8_t driver_uuid[VK_UUID_SIZE];
   uint8_t device_uuid[VK_UUID_SIZE];
   uint8_t cache_uuid[VK_UUID_SIZE];

   struct wsi_device wsi_device;

   int local_fd;
   bool has_local;
   int64_t local_major;
   int64_t local_minor;
   int master_fd;
   bool has_master;
   int64_t master_major;
   int64_t master_minor;

   uint32_t gmem_size;
   uint64_t gmem_base;
   uint32_t ccu_offset_gmem;
   uint32_t ccu_offset_bypass;

   struct fd_dev_id dev_id;
   const struct fd_dev_info *info;

   int msm_major_version;
   int msm_minor_version;

   /* Address space and global fault count for this local_fd with the DRM
    * backend */
   uint64_t fault_count;

   struct tu_memory_heap heap;

   struct vk_sync_type syncobj_type;
   struct vk_sync_timeline_type timeline_type;
   const struct vk_sync_type *sync_types[3];
};
VK_DEFINE_HANDLE_CASTS(tu_physical_device, vk.base, VkPhysicalDevice,
                       VK_OBJECT_TYPE_PHYSICAL_DEVICE)

struct tu_instance
{
   struct vk_instance vk;

   uint32_t api_version;
   int physical_device_count;
   struct tu_physical_device physical_devices[TU_MAX_DRM_DEVICES];

   struct driOptionCache dri_options;
   struct driOptionCache available_dri_options;

   enum tu_debug_flags debug_flags;
};
VK_DEFINE_HANDLE_CASTS(tu_instance, vk.base, VkInstance,
                       VK_OBJECT_TYPE_INSTANCE)

struct tu_queue
{
   struct vk_queue vk;

   struct tu_device *device;

   uint32_t msm_queue_id;
   int fence;
};
VK_DEFINE_HANDLE_CASTS(tu_queue, vk.base, VkQueue, VK_OBJECT_TYPE_QUEUE)

/* This struct defines the layout of the global_bo */
struct tu6_global
{
   /* clear/blit shaders */
   uint32_t shaders[TU_BLIT_SHADER_SIZE];

   uint32_t seqno_dummy;          /* dummy seqno for CP_EVENT_WRITE */
   uint32_t _pad0;
   volatile uint32_t vsc_draw_overflow;
   uint32_t _pad1;
   volatile uint32_t vsc_prim_overflow;
   uint32_t _pad2;
   uint64_t predicate;

   /* scratch space for VPC_SO[i].FLUSH_BASE_LO/HI; each entry starts on a
    * 32-byte boundary. */
   struct {
      uint32_t offset;
      uint32_t pad[7];
   } flush_base[4];

   ALIGN16 uint32_t cs_indirect_xyz[3];

   volatile uint32_t vtx_stats_query_not_running;

   /* To know when renderpass stats for autotune are valid */
   volatile uint32_t autotune_fence;

   /* For recycling command buffers for dynamic suspend/resume commands */
   volatile uint32_t dynamic_rendering_fence;

   volatile uint32_t dbg_one;
   volatile uint32_t dbg_gmem_total_loads;
   volatile uint32_t dbg_gmem_taken_loads;
   volatile uint32_t dbg_gmem_total_stores;
   volatile uint32_t dbg_gmem_taken_stores;

   /* Written from GPU */
   volatile uint32_t breadcrumb_gpu_sync_seqno;
   uint32_t _pad3;
   /* Written from CPU, acknowledges value written from GPU */
   volatile uint32_t breadcrumb_cpu_sync_seqno;
   uint32_t _pad4;

   /* note: a larger global bo will be used for customBorderColors */
   struct bcolor_entry bcolor_builtin[TU_BORDER_COLOR_BUILTIN], bcolor[];
};
#define gb_offset(member) offsetof(struct tu6_global, member)
#define global_iova(cmd, member) ((cmd)->device->global_bo->iova + gb_offset(member))
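
/* Example usage (illustrative): global_iova() yields the GPU address of a
 * tu6_global member for emission into a command stream, e.g.
 *
 *    uint64_t dummy_iova = global_iova(cmd, seqno_dummy);
 *
 * which expands to
 *
 *    (cmd)->device->global_bo->iova +
 *       offsetof(struct tu6_global, seqno_dummy)
 */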

struct tu_device
{
   struct vk_device vk;
   struct tu_instance *instance;

   struct tu_queue *queues[TU_MAX_QUEUE_FAMILIES];
   int queue_count[TU_MAX_QUEUE_FAMILIES];

   struct tu_physical_device *physical_device;
   int fd;

   struct ir3_compiler *compiler;

   /* Backup in-memory cache to be used if the app doesn't provide one */
   struct vk_pipeline_cache *mem_cache;

#define MIN_SCRATCH_BO_SIZE_LOG2 12 /* A page */

   /* Currently the kernel driver uses a 32-bit GPU address space, but it
    * should be impossible to go beyond 48 bits.
    */
   struct {
      struct tu_bo *bo;
      mtx_t construct_mtx;
      bool initialized;
   } scratch_bos[48 - MIN_SCRATCH_BO_SIZE_LOG2];

   struct tu_bo *global_bo;

   uint32_t implicit_sync_bo_count;

   /* Device-global BO suballocator for reducing BO management overhead for
    * (read-only) pipeline state.  Synchronized by pipeline_mutex.
    */
   struct tu_suballocator pipeline_suballoc;
   mtx_t pipeline_mutex;

   /* Device-global BO suballocator for reducing BO management overhead for
    * small gmem/sysmem autotune result buffers.  Synchronized by
    * autotune_mutex.
    */
   struct tu_suballocator autotune_suballoc;
   mtx_t autotune_mutex;

   /* the blob driver seems to always use 8K factor and 128K param sizes;
    * copy them */
#define TU_TESS_FACTOR_SIZE (8 * 1024)
#define TU_TESS_PARAM_SIZE (128 * 1024)
#define TU_TESS_BO_SIZE (TU_TESS_FACTOR_SIZE + TU_TESS_PARAM_SIZE)
   /* Lazily allocated, protected by the device mutex. */
   struct tu_bo *tess_bo;

   struct ir3_shader_variant *global_shader_variants[GLOBAL_SH_COUNT];
   struct ir3_shader *global_shaders[GLOBAL_SH_COUNT];
   uint64_t global_shader_va[GLOBAL_SH_COUNT];

   uint32_t vsc_draw_strm_pitch;
   uint32_t vsc_prim_strm_pitch;
   BITSET_DECLARE(custom_border_color, TU_BORDER_COLOR_COUNT);
   mtx_t mutex;

   /* bo list for submits: */
   struct drm_msm_gem_submit_bo *bo_list;
   /* map bo handles to bo list index: */
   uint32_t bo_count, bo_list_size;
   mtx_t bo_mutex;
   /* protects imported BOs creation/freeing */
   struct u_rwlock dma_bo_lock;

   /* This array holds all our 'struct tu_bo' allocations. We use this
    * so we can add a refcount to our BOs and check if a particular BO
    * was already allocated in this device using its GEM handle. This is
    * necessary to properly manage BO imports, because the kernel doesn't
    * refcount the underlying BO memory.
    *
    * Specifically, when self-importing (i.e. importing a BO into the same
    * device that created it), the kernel will give us the same BO handle
    * for both BOs and we must only free it once, when both references are
    * freed. Otherwise, if we are not self-importing, we get two different
    * BO handles, and we want to free each one individually.
    *
    * The refcount is also useful for being able to maintain BOs across
    * VK object lifetimes, such as pipelines suballocating out of BOs
    * allocated on the device. (A sketch of the self-import dedup follows
    * this struct.)
    */
   struct util_sparse_array bo_map;

   /* Command streams to set pass index to a scratch reg */
   struct tu_cs *perfcntrs_pass_cs;
   struct tu_cs_entry *perfcntrs_pass_cs_entries;

   struct util_dynarray dynamic_rendering_pending;
   VkCommandPool dynamic_rendering_pool;
   uint32_t dynamic_rendering_fence;

   /* Condition variable for timeline semaphore to notify waiters when a
    * new submit is executed. */
   pthread_cond_t timeline_cond;
   pthread_mutex_t submit_mutex;

   struct tu_autotune autotune;

   struct breadcrumbs_context *breadcrumbs_ctx;

#ifdef ANDROID
   const void *gralloc;
   enum {
      TU_GRALLOC_UNKNOWN,
      TU_GRALLOC_CROS,
      TU_GRALLOC_OTHER,
   } gralloc_type;
#endif

   uint32_t submit_count;

   struct u_trace_context trace_context;

#ifdef HAVE_PERFETTO
   struct tu_perfetto_state perfetto;
#endif

   bool use_z24uint_s8uint;
};
VK_DEFINE_HANDLE_CASTS(tu_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE)
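
/* Sketch (illustrative; the real import logic lives in the DRM backend,
 * and the field names here are hypothetical): the self-import dedup
 * described in the bo_map comment above.
 *
 *    struct tu_bo *bo = tu_device_lookup_bo(dev, gem_handle);
 *    if (bo->refcnt == 0) {
 *       // First time this GEM handle is seen: initialize the slot.
 *    } else {
 *       // Self-import: the kernel handed back the same handle, so take
 *       // another reference instead of tracking a second BO; the handle
 *       // must be closed only once, when the refcount drops to zero.
 *    }
 *    bo->refcnt++;
 */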

struct tu_device_memory
{
   struct vk_object_base base;

   struct tu_bo *bo;
};
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_device_memory, base, VkDeviceMemory,
                               VK_OBJECT_TYPE_DEVICE_MEMORY)

struct tu_buffer
{
   struct vk_object_base base;

   VkDeviceSize size;

   VkBufferUsageFlags usage;
   VkBufferCreateFlags flags;

   struct tu_bo *bo;
   uint64_t iova;
};
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_buffer, base, VkBuffer,
                               VK_OBJECT_TYPE_BUFFER)

struct tu_attachment_info
{
   struct tu_image_view *attachment;
};

struct tu_tiling_config {
   /* size of the first tile */
   VkExtent2D tile0;
   /* number of tiles */
   VkExtent2D tile_count;

   /* size of the first VSC pipe */
   VkExtent2D pipe0;
   /* number of VSC pipes */
   VkExtent2D pipe_count;

   /* Whether binning should be used for gmem rendering with this
    * framebuffer. */
   bool binning;

   /* Whether binning could be used for gmem rendering with this
    * framebuffer. */
   bool binning_possible;

   /* pipe register values */
   uint32_t pipe_config[MAX_VSC_PIPES];
   uint32_t pipe_sizes[MAX_VSC_PIPES];
};
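
/* Illustrative relationship (not defined in this header): with a fixed
 * tile size, the tile grid has to cover the framebuffer, so one would
 * expect roughly
 *
 *    tile_count.width  == DIV_ROUND_UP(fb_width,  tile0.width)
 *    tile_count.height == DIV_ROUND_UP(fb_height, tile0.height)
 *
 * with the actual values chosen by the tiling layout code, which also has
 * to respect the VSC pipe limits (pipe0/pipe_count above).
 */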

struct tu_framebuffer
{
   struct vk_object_base base;

   uint32_t width;
   uint32_t height;
   uint32_t layers;

   struct tu_tiling_config tiling[TU_GMEM_LAYOUT_COUNT];

   uint32_t attachment_count;
   struct tu_attachment_info attachments[0];
};
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_framebuffer, base, VkFramebuffer,
                               VK_OBJECT_TYPE_FRAMEBUFFER)
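
/* Example (illustrative): tu_framebuffer ends in a flexible array, so an
 * allocation with room for `count` attachments is sized as
 *
 *    sizeof(struct tu_framebuffer) +
 *       count * sizeof(struct tu_attachment_info)
 */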

struct tu_event
{
   struct vk_object_base base;
   struct tu_bo *bo;
};
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_event, base, VkEvent, VK_OBJECT_TYPE_EVENT)

struct tu_sampler {
   struct vk_object_base base;

   uint32_t descriptor[A6XX_TEX_SAMP_DWORDS];
   struct tu_sampler_ycbcr_conversion *ycbcr_sampler;
};
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_sampler, base, VkSampler,
                               VK_OBJECT_TYPE_SAMPLER)

uint64_t
tu_get_system_heap_size(void);

const char *
tu_get_debug_option_name(int id);

VkResult
tu_physical_device_init(struct tu_physical_device *device,
                        struct tu_instance *instance);

uint64_t
tu_device_ticks_to_ns(struct tu_device *dev, uint64_t ts);

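/* Returns the 'struct tu_bo' slot tracking a GEM handle. Note (an
 * assumption based on util_sparse_array semantics): elements are allocated
 * zero-filled on first access, so a fresh handle yields a zeroed tu_bo.
 */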
static inline struct tu_bo *
tu_device_lookup_bo(struct tu_device *device, uint32_t handle)
{
   return (struct tu_bo *) util_sparse_array_get(&device->bo_map, handle);
}

/* Get a scratch bo for use inside a command buffer. This will always
 * return the same bo given the same size or similar sizes, so only one
 * scratch bo can be in use at a time. It's meant for short-lived things
 * where we need to write to some piece of memory, read from it, and then
 * immediately discard it.
 */
VkResult
tu_get_scratch_bo(struct tu_device *dev, uint64_t size, struct tu_bo **bo);
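
/* Example (illustrative): grabbing scratch memory during command buffer
 * recording. The BO stays owned by the device, so the caller must not free
 * it, and since the same BO may be returned for similar sizes its contents
 * are only valid until the next user.
 *
 *    struct tu_bo *scratch;
 *    VkResult result = tu_get_scratch_bo(dev, 4096, &scratch);
 *    if (result != VK_SUCCESS)
 *       return result;
 *    // ... emit writes/reads of scratch->iova into the command stream ...
 */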

void tu_setup_dynamic_framebuffer(struct tu_cmd_buffer *cmd_buffer,
                                  const VkRenderingInfo *pRenderingInfo);

void
tu_copy_timestamp_buffer(struct u_trace_context *utctx, void *cmdstream,
                         void *ts_from, uint32_t from_offset,
                         void *ts_to, uint32_t to_offset,
                         uint32_t count);

VkResult
tu_create_copy_timestamp_cs(struct tu_cmd_buffer *cmdbuf, struct tu_cs **cs,
                            struct u_trace **trace_copy);

/* If we copy the trace and timestamps, we will have to free them later. */
struct tu_u_trace_cmd_data
{
   struct tu_cs *timestamp_copy_cs;
   struct u_trace *trace;
};

/* Data necessary to retrieve timestamps and to clean up all associated
 * resources afterwards.
 */
struct tu_u_trace_submission_data
{
   uint32_t submission_id;
   /* We have to know when timestamps are available; this sync object
    * signals that they are.
    */
   struct tu_u_trace_syncobj *syncobj;

   uint32_t cmd_buffer_count;
   uint32_t last_buffer_with_tracepoints;
   struct tu_u_trace_cmd_data *cmd_trace_data;
};

VkResult
tu_u_trace_submission_data_create(
   struct tu_device *device,
   struct tu_cmd_buffer **cmd_buffers,
   uint32_t cmd_buffer_count,
   struct tu_u_trace_submission_data **submission_data);

void
tu_u_trace_submission_data_finish(
   struct tu_device *device,
   struct tu_u_trace_submission_data *submission_data);

#endif /* TU_DEVICE_H */