/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 * SPDX-License-Identifier: MIT
 *
 * based in part on anv driver which is:
 * Copyright © 2015 Intel Corporation
 */

#ifndef TU_DEVICE_H
#define TU_DEVICE_H

#include "tu_common.h"

#include "tu_autotune.h"
#include "tu_pass.h"
#include "tu_perfetto.h"
#include "tu_suballoc.h"
#include "tu_util.h"

/* queue types */
#define TU_QUEUE_GENERAL 0

#define TU_MAX_QUEUE_FAMILIES 1

#define TU_BORDER_COLOR_COUNT 4096
#define TU_BORDER_COLOR_BUILTIN 6

#define TU_BLIT_SHADER_SIZE 1024

/* extra space in vsc draw/prim streams */
#define VSC_PAD 0x40

enum tu_debug_flags
{
   TU_DEBUG_STARTUP = 1 << 0,
   TU_DEBUG_NIR = 1 << 1,
   TU_DEBUG_NOBIN = 1 << 3,
   TU_DEBUG_SYSMEM = 1 << 4,
   TU_DEBUG_FORCEBIN = 1 << 5,
   TU_DEBUG_NOUBWC = 1 << 6,
   TU_DEBUG_NOMULTIPOS = 1 << 7,
   TU_DEBUG_NOLRZ = 1 << 8,
   TU_DEBUG_PERFC = 1 << 9,
   TU_DEBUG_FLUSHALL = 1 << 10,
   TU_DEBUG_SYNCDRAW = 1 << 11,
   TU_DEBUG_DONT_CARE_AS_LOAD = 1 << 12,
   TU_DEBUG_GMEM = 1 << 13,
   TU_DEBUG_RAST_ORDER = 1 << 14,
   TU_DEBUG_UNALIGNED_STORE = 1 << 15,
   TU_DEBUG_LAYOUT = 1 << 16,
   TU_DEBUG_LOG_SKIP_GMEM_OPS = 1 << 17,
   TU_DEBUG_PERF = 1 << 18,
   TU_DEBUG_NOLRZFC = 1 << 19,
   TU_DEBUG_DYNAMIC = 1 << 20,
};
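
/* These flags are combined from a comma-separated debug option string in
 * the environment. A minimal sketch of such a mapping (the option table,
 * variable names, and parse loop below are illustrative only, not the
 * driver's actual parser):
 *
 *    static const struct {
 *       const char *name;
 *       enum tu_debug_flags flag;
 *    } opts[] = {
 *       { "startup", TU_DEBUG_STARTUP },
 *       { "nir",     TU_DEBUG_NIR },
 *       { "nobin",   TU_DEBUG_NOBIN },
 *       { "sysmem",  TU_DEBUG_SYSMEM },
 *       { "gmem",    TU_DEBUG_GMEM },
 *    };
 *    enum tu_debug_flags flags = 0;
 *    for (unsigned i = 0; str && i < ARRAY_SIZE(opts); i++) {
 *       if (strstr(str, opts[i].name))
 *          flags |= opts[i].flag;
 *    }
 */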

enum global_shader {
   GLOBAL_SH_VS_BLIT,
   GLOBAL_SH_VS_CLEAR,
   GLOBAL_SH_FS_BLIT,
   GLOBAL_SH_FS_BLIT_ZSCALE,
   GLOBAL_SH_FS_COPY_MS,
   GLOBAL_SH_FS_CLEAR0,
   GLOBAL_SH_FS_CLEAR_MAX = GLOBAL_SH_FS_CLEAR0 + MAX_RTS,
   GLOBAL_SH_COUNT,
};
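
/* The clear fragment shaders occupy a contiguous range, so the right variant
 * can be picked by offsetting from GLOBAL_SH_FS_CLEAR0. Illustrative use
 * (the variable names here are made up for the example):
 *
 *    enum global_shader sh = GLOBAL_SH_FS_CLEAR0 + n;   // 0 <= n < MAX_RTS
 *    uint64_t va = device->global_shader_va[sh];
 */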

struct tu_memory_heap {
   /* Standard bits passed on to the client */
   VkDeviceSize size;
   VkMemoryHeapFlags flags;

   /** Copied from ANV:
    *
    * Driver-internal book-keeping.
    *
    * Align it to 64 bits to make atomic operations faster on 32 bit platforms.
    */
   VkDeviceSize used __attribute__ ((aligned (8)));
};
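
/* Because `used` is plain book-keeping, allocation paths can account against
 * the heap with an atomic add and undo it on overflow. A minimal sketch of
 * that pattern, assuming Mesa's u_atomic helpers (this is not the driver's
 * actual accounting code):
 *
 *    uint64_t used = p_atomic_add_return(&heap->used, size);
 *    if (used > heap->size) {
 *       p_atomic_add(&heap->used, -(int64_t) size);
 *       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
 *    }
 */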

struct tu_physical_device
{
   struct vk_physical_device vk;

   struct tu_instance *instance;

   const char *name;
   uint8_t driver_uuid[VK_UUID_SIZE];
   uint8_t device_uuid[VK_UUID_SIZE];
   uint8_t cache_uuid[VK_UUID_SIZE];

   struct wsi_device wsi_device;

   int local_fd;
   bool has_local;
   int64_t local_major;
   int64_t local_minor;
   int master_fd;
   bool has_master;
   int64_t master_major;
   int64_t master_minor;

   uint32_t gmem_size;
   uint64_t gmem_base;
   uint32_t ccu_offset_gmem;
   uint32_t ccu_offset_bypass;

   struct fd_dev_id dev_id;
   const struct fd_dev_info *info;

   int msm_major_version;
   int msm_minor_version;

   /* Address space and global fault count for this local_fd with DRM backend */
   uint64_t fault_count;

   struct tu_memory_heap heap;

   struct vk_sync_type syncobj_type;
   struct vk_sync_timeline_type timeline_type;
   const struct vk_sync_type *sync_types[3];
};
VK_DEFINE_HANDLE_CASTS(tu_physical_device, vk.base, VkPhysicalDevice,
                       VK_OBJECT_TYPE_PHYSICAL_DEVICE)

struct tu_instance
{
   struct vk_instance vk;

   uint32_t api_version;
   int physical_device_count;
   struct tu_physical_device physical_devices[TU_MAX_DRM_DEVICES];

   struct driOptionCache dri_options;
   struct driOptionCache available_dri_options;

   enum tu_debug_flags debug_flags;
};
VK_DEFINE_HANDLE_CASTS(tu_instance, vk.base, VkInstance,
                       VK_OBJECT_TYPE_INSTANCE)

struct tu_queue
{
   struct vk_queue vk;

   struct tu_device *device;

   uint32_t msm_queue_id;
   int fence;
};
VK_DEFINE_HANDLE_CASTS(tu_queue, vk.base, VkQueue, VK_OBJECT_TYPE_QUEUE)

/* This struct defines the layout of the global_bo */
struct tu6_global
{
   /* clear/blit shaders */
   uint32_t shaders[TU_BLIT_SHADER_SIZE];

   uint32_t seqno_dummy; /* dummy seqno for CP_EVENT_WRITE */
   uint32_t _pad0;
   volatile uint32_t vsc_draw_overflow;
   uint32_t _pad1;
   volatile uint32_t vsc_prim_overflow;
   uint32_t _pad2;
   uint64_t predicate;

   /* scratch space for VPC_SO[i].FLUSH_BASE_LO/HI, start on 32 byte boundary. */
   struct {
      uint32_t offset;
      uint32_t pad[7];
   } flush_base[4];

   ALIGN16 uint32_t cs_indirect_xyz[3];

   volatile uint32_t vtx_stats_query_not_running;

   /* To know when renderpass stats for autotune are valid */
   volatile uint32_t autotune_fence;

   /* For recycling command buffers for dynamic suspend/resume commands */
   volatile uint32_t dynamic_rendering_fence;

   volatile uint32_t dbg_one;
   volatile uint32_t dbg_gmem_total_loads;
   volatile uint32_t dbg_gmem_taken_loads;
   volatile uint32_t dbg_gmem_total_stores;
   volatile uint32_t dbg_gmem_taken_stores;

   /* Written from GPU */
   volatile uint32_t breadcrumb_gpu_sync_seqno;
   uint32_t _pad3;
   /* Written from CPU, acknowledges value written from GPU */
   volatile uint32_t breadcrumb_cpu_sync_seqno;
   uint32_t _pad4;

   /* note: larger global bo will be used for customBorderColors */
   struct bcolor_entry bcolor_builtin[TU_BORDER_COLOR_BUILTIN], bcolor[];
};
#define gb_offset(member) offsetof(struct tu6_global, member)
#define global_iova(cmd, member) ((cmd)->device->global_bo->iova + gb_offset(member))
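
/* Typical use: command streams address a member of the global BO through
 * these helpers, e.g. (illustrative)
 *
 *    tu_cs_emit_qw(cs, global_iova(cmd, vsc_draw_overflow));
 *
 * which expands to the global BO's iova plus the member's offset within
 * struct tu6_global.
 */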

struct tu_device
{
   struct vk_device vk;
   struct tu_instance *instance;

   struct tu_queue *queues[TU_MAX_QUEUE_FAMILIES];
   int queue_count[TU_MAX_QUEUE_FAMILIES];

   struct tu_physical_device *physical_device;
   int fd;

   struct ir3_compiler *compiler;

   /* Backup in-memory cache to be used if the app doesn't provide one */
   struct vk_pipeline_cache *mem_cache;

#define MIN_SCRATCH_BO_SIZE_LOG2 12 /* A page */

   /* Currently the kernel driver uses a 32-bit GPU address space, but it
    * should be impossible to go beyond 48 bits.
    */
   struct {
      struct tu_bo *bo;
      mtx_t construct_mtx;
      bool initialized;
   } scratch_bos[48 - MIN_SCRATCH_BO_SIZE_LOG2];

   struct tu_bo *global_bo;

   uint32_t implicit_sync_bo_count;

   /* Device-global BO suballocator for reducing BO management overhead for
    * (read-only) pipeline state. Synchronized by pipeline_mutex.
    */
   struct tu_suballocator pipeline_suballoc;
   mtx_t pipeline_mutex;
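
   /* Suballocations must be made with pipeline_mutex held; a sketch of the
    * expected pattern (treat the exact tu_suballoc_bo_alloc signature as an
    * assumption here, see tu_suballoc.h for the real interface):
    *
    *    mtx_lock(&dev->pipeline_mutex);
    *    VkResult result = tu_suballoc_bo_alloc(&pipeline->bo,
    *                                           &dev->pipeline_suballoc,
    *                                           size, alignment);
    *    mtx_unlock(&dev->pipeline_mutex);
    */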

   /* Device-global BO suballocator for reducing BO management overhead for
    * small gmem/sysmem autotune result buffers. Synchronized by
    * autotune_mutex.
    */
   struct tu_suballocator autotune_suballoc;
   mtx_t autotune_mutex;

   /* the blob seems to always use 8K factor and 128K param sizes, copy them */
#define TU_TESS_FACTOR_SIZE (8 * 1024)
#define TU_TESS_PARAM_SIZE (128 * 1024)
#define TU_TESS_BO_SIZE (TU_TESS_FACTOR_SIZE + TU_TESS_PARAM_SIZE)
   /* Lazily allocated, protected by the device mutex. */
   struct tu_bo *tess_bo;
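
   /* A sketch of the lazy allocation mentioned above, following the usual
    * check/lock/re-check pattern (the BO allocation call is abbreviated and
    * its flags are hypothetical):
    *
    *    if (!dev->tess_bo) {
    *       mtx_lock(&dev->mutex);
    *       if (!dev->tess_bo)
    *          result = tu_bo_init_new(dev, &dev->tess_bo, TU_TESS_BO_SIZE, flags);
    *       mtx_unlock(&dev->mutex);
    *    }
    */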

   struct ir3_shader_variant *global_shader_variants[GLOBAL_SH_COUNT];
   struct ir3_shader *global_shaders[GLOBAL_SH_COUNT];
   uint64_t global_shader_va[GLOBAL_SH_COUNT];

   uint32_t vsc_draw_strm_pitch;
   uint32_t vsc_prim_strm_pitch;
   BITSET_DECLARE(custom_border_color, TU_BORDER_COLOR_COUNT);
   mtx_t mutex;

   /* bo list for submits: */
   struct drm_msm_gem_submit_bo *bo_list;
   /* map bo handles to bo list index: */
   uint32_t bo_count, bo_list_size;
   mtx_t bo_mutex;
   /* protects imported BOs creation/freeing */
   struct u_rwlock dma_bo_lock;

   /* This array holds all our 'struct tu_bo' allocations. We use this
    * so we can add a refcount to our BOs and check if a particular BO
    * was already allocated in this device using its GEM handle. This is
    * necessary to properly manage BO imports, because the kernel doesn't
    * refcount the underlying BO memory.
    *
    * Specifically, when self-importing (i.e. importing a BO into the same
    * device that created it), the kernel will give us the same BO handle
    * for both BOs and we must only free it once when both references are
    * freed. Otherwise, if we are not self-importing, we get two different BO
    * handles, and we want to free each one individually.
    *
    * The refcount is also useful for being able to maintain BOs across
    * VK object lifetimes, such as pipelines suballocating out of BOs
    * allocated on the device.
    */
   struct util_sparse_array bo_map;
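
   /* A sketch of the self-import case this map handles (the refcnt field and
    * the surrounding logic are illustrative; tu_device_lookup_bo() is
    * declared further below): an import first looks up the GEM handle, and
    * an entry that is already live just gains a reference instead of a
    * second tu_bo being created:
    *
    *    struct tu_bo *bo = tu_device_lookup_bo(device, gem_handle);
    *    if (bo->refcnt != 0) {
    *       p_atomic_inc(&bo->refcnt);   // same device importing its own BO
    *       return bo;
    *    }
    *    // otherwise initialize a fresh tu_bo in place for this handle
    */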

   /* Command streams to set pass index to a scratch reg */
   struct tu_cs *perfcntrs_pass_cs;
   struct tu_cs_entry *perfcntrs_pass_cs_entries;

   struct util_dynarray dynamic_rendering_pending;
   VkCommandPool dynamic_rendering_pool;
   uint32_t dynamic_rendering_fence;

   /* Condition variable for timeline semaphore to notify waiters when a
    * new submit is executed. */
   pthread_cond_t timeline_cond;
   pthread_mutex_t submit_mutex;

   struct tu_autotune autotune;

   struct breadcrumbs_context *breadcrumbs_ctx;

#ifdef ANDROID
   const void *gralloc;
   enum {
      TU_GRALLOC_UNKNOWN,
      TU_GRALLOC_CROS,
      TU_GRALLOC_OTHER,
   } gralloc_type;
#endif

   uint32_t submit_count;

   struct u_trace_context trace_context;

#ifdef HAVE_PERFETTO
   struct tu_perfetto_state perfetto;
#endif

   bool use_z24uint_s8uint;
};
VK_DEFINE_HANDLE_CASTS(tu_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE)

struct tu_device_memory
{
   struct vk_object_base base;

   struct tu_bo *bo;
};
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_device_memory, base, VkDeviceMemory,
                               VK_OBJECT_TYPE_DEVICE_MEMORY)

struct tu_buffer
{
   struct vk_object_base base;

   VkDeviceSize size;

   VkBufferUsageFlags usage;
   VkBufferCreateFlags flags;

   struct tu_bo *bo;
   uint64_t iova;
};
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_buffer, base, VkBuffer,
                               VK_OBJECT_TYPE_BUFFER)

struct tu_attachment_info
{
   struct tu_image_view *attachment;
};

struct tu_tiling_config {
   /* size of the first tile */
   VkExtent2D tile0;
   /* number of tiles */
   VkExtent2D tile_count;

   /* size of the first VSC pipe */
   VkExtent2D pipe0;
   /* number of VSC pipes */
   VkExtent2D pipe_count;

   /* Whether binning should be used for gmem rendering using this framebuffer. */
   bool binning;

   /* Whether binning could be used for gmem rendering using this framebuffer. */
   bool binning_possible;

   /* pipe register values */
   uint32_t pipe_config[MAX_VSC_PIPES];
   uint32_t pipe_sizes[MAX_VSC_PIPES];
};
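
/* tile_count/pipe_count are derived from the framebuffer extent and the
 * per-tile/per-pipe size. A minimal sketch of the relationship, ignoring
 * the alignment and GMEM-size constraints the real layout code applies:
 *
 *    tiling->tile_count.width  = DIV_ROUND_UP(fb->width,  tiling->tile0.width);
 *    tiling->tile_count.height = DIV_ROUND_UP(fb->height, tiling->tile0.height);
 */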

struct tu_framebuffer
{
   struct vk_object_base base;

   uint32_t width;
   uint32_t height;
   uint32_t layers;

   struct tu_tiling_config tiling[TU_GMEM_LAYOUT_COUNT];

   uint32_t attachment_count;
   struct tu_attachment_info attachments[0];
};
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_framebuffer, base, VkFramebuffer,
                               VK_OBJECT_TYPE_FRAMEBUFFER)

struct tu_event
{
   struct vk_object_base base;
   struct tu_bo *bo;
};
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_event, base, VkEvent, VK_OBJECT_TYPE_EVENT)

struct tu_sampler {
   struct vk_object_base base;

   uint32_t descriptor[A6XX_TEX_SAMP_DWORDS];
   struct tu_sampler_ycbcr_conversion *ycbcr_sampler;
};
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_sampler, base, VkSampler,
                               VK_OBJECT_TYPE_SAMPLER)

uint64_t
tu_get_system_heap_size(void);

const char *
tu_get_debug_option_name(int id);

VkResult
tu_physical_device_init(struct tu_physical_device *device,
                        struct tu_instance *instance);

uint64_t
tu_device_ticks_to_ns(struct tu_device *dev, uint64_t ts);

static inline struct tu_bo *
tu_device_lookup_bo(struct tu_device *device, uint32_t handle)
{
   return (struct tu_bo *) util_sparse_array_get(&device->bo_map, handle);
}

/* Get a scratch bo for use inside a command buffer. This will always return
 * the same bo given the same size or similar sizes, so only one scratch bo
 * can be used at the same time. It's meant for short-lived things where we
 * need to write to some piece of memory, read from it, and then immediately
 * discard it.
 */
VkResult
tu_get_scratch_bo(struct tu_device *dev, uint64_t size, struct tu_bo **bo);
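
/* "Similar sizes" refers to power-of-two bucketing: the request is rounded
 * up to the next power of two and one BO is kept per bucket in scratch_bos[].
 * A sketch of the bucket selection, assuming Mesa's util_logbase2_ceil64()
 * helper:
 *
 *    unsigned index = size > (1ull << MIN_SCRATCH_BO_SIZE_LOG2) ?
 *       util_logbase2_ceil64(size) - MIN_SCRATCH_BO_SIZE_LOG2 : 0;
 *    struct tu_bo *scratch = dev->scratch_bos[index].bo;
 */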

void tu_setup_dynamic_framebuffer(struct tu_cmd_buffer *cmd_buffer,
                                  const VkRenderingInfo *pRenderingInfo);

void
tu_copy_timestamp_buffer(struct u_trace_context *utctx, void *cmdstream,
                         void *ts_from, uint32_t from_offset,
                         void *ts_to, uint32_t to_offset,
                         uint32_t count);

VkResult
tu_create_copy_timestamp_cs(struct tu_cmd_buffer *cmdbuf, struct tu_cs **cs,
                            struct u_trace **trace_copy);

/* If we copy trace and timestamps we will have to free them. */
struct tu_u_trace_cmd_data
{
   struct tu_cs *timestamp_copy_cs;
   struct u_trace *trace;
};

/* Data necessary to retrieve timestamps and clean up all
 * associated resources afterwards.
 */
struct tu_u_trace_submission_data
{
   uint32_t submission_id;
   /* We have to know when timestamps are available; this sync object
    * indicates that.
    */
   struct tu_u_trace_syncobj *syncobj;

   uint32_t cmd_buffer_count;
   uint32_t last_buffer_with_tracepoints;
   struct tu_u_trace_cmd_data *cmd_trace_data;
};

VkResult
tu_u_trace_submission_data_create(
   struct tu_device *device,
   struct tu_cmd_buffer **cmd_buffers,
   uint32_t cmd_buffer_count,
   struct tu_u_trace_submission_data **submission_data);

void
tu_u_trace_submission_data_finish(
   struct tu_device *device,
   struct tu_u_trace_submission_data *submission_data);

#endif /* TU_DEVICE_H */