/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 *
 * based in part on anv driver which is:
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#ifndef TU_PRIVATE_H
#define TU_PRIVATE_H

#include <assert.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef HAVE_VALGRIND
#include <memcheck.h>
#include <valgrind.h>
#define VG(x) x
#else
#define VG(x) ((void)0)
#endif

#define MESA_LOG_TAG "TU"

#include "c11/threads.h"
#include "main/macros.h"
#include "util/bitscan.h"
#include "util/list.h"
#include "util/log.h"
#include "util/macros.h"
#include "util/u_atomic.h"
#include "util/u_dynarray.h"
#include "util/perf/u_trace.h"
#include "vk_alloc.h"
#include "vk_debug_report.h"
#include "vk_device.h"
#include "vk_dispatch_table.h"
#include "vk_extensions.h"
#include "vk_instance.h"
#include "vk_log.h"
#include "vk_physical_device.h"
#include "vk_shader_module.h"
#include "wsi_common.h"

#include "ir3/ir3_compiler.h"
#include "ir3/ir3_shader.h"

#include "adreno_common.xml.h"
#include "adreno_pm4.xml.h"
#include "a6xx.xml.h"
#include "fdl/freedreno_layout.h"
#include "common/freedreno_dev_info.h"
#include "perfcntrs/freedreno_perfcntr.h"

#include "tu_descriptor_set.h"
#include "tu_util.h"
#include "tu_perfetto.h"

/* Pre-declarations needed for WSI entrypoints */
struct wl_surface;
struct wl_display;
typedef struct xcb_connection_t xcb_connection_t;
typedef uint32_t xcb_visualid_t;
typedef uint32_t xcb_window_t;

#include <vulkan/vk_android_native_buffer.h>
#include <vulkan/vk_icd.h>
#include <vulkan/vulkan.h>

#include "tu_entrypoints.h"

#include "vk_format.h"
#include "vk_command_buffer.h"
#include "vk_queue.h"

#define MAX_VBS 32
#define MAX_VERTEX_ATTRIBS 32
#define MAX_RTS 8
#define MAX_VSC_PIPES 32
#define MAX_VIEWPORTS 16
#define MAX_VIEWPORT_SIZE (1 << 14)
#define MAX_SCISSORS 16
#define MAX_DISCARD_RECTANGLES 4
#define MAX_PUSH_CONSTANTS_SIZE 128
#define MAX_PUSH_DESCRIPTORS 32
#define MAX_DYNAMIC_UNIFORM_BUFFERS 16
#define MAX_DYNAMIC_STORAGE_BUFFERS 8
#define MAX_DYNAMIC_BUFFERS \
   (MAX_DYNAMIC_UNIFORM_BUFFERS + MAX_DYNAMIC_STORAGE_BUFFERS)
#define TU_MAX_DRM_DEVICES 8
#define MAX_VIEWS 16
#define MAX_BIND_POINTS 2 /* compute + graphics */
/* The Qualcomm driver exposes 0x20000058 */
#define MAX_STORAGE_BUFFER_RANGE 0x20000000
/* We use ldc for uniform buffer loads, just like the Qualcomm driver, so
 * expose the same maximum range.
 * TODO: The SIZE bitfield is 15 bits, and in 4-dword units, so the actual
 * range might be higher: 2^15 * 16 bytes = 512 KiB, versus the 64 KiB
 * (0x10000) exposed here.
 */
#define MAX_UNIFORM_BUFFER_RANGE 0x10000

#define A6XX_TEX_CONST_DWORDS 16
#define A6XX_TEX_SAMP_DWORDS 4

#define COND(bool, val) ((bool) ? (val) : 0)
#define BIT(bit) (1u << (bit))

/* Whenever we generate an error, pass it through this function. Useful for
 * debugging, where we can break on it. Only call at error site, not when
 * propagating errors. Might be useful to plug in a stack trace here.
 */

struct tu_instance;

VkResult
__vk_startup_errorf(struct tu_instance *instance,
                    VkResult error,
                    bool force_print,
                    const char *file,
                    int line,
                    const char *format,
                    ...) PRINTFLIKE(6, 7);

/* Prints startup errors if TU_DEBUG=startup is set or on a debug driver
 * build.
 */
#define vk_startup_errorf(instance, error, format, ...) \
   __vk_startup_errorf(instance, error, \
                       instance->debug_flags & TU_DEBUG_STARTUP, \
                       __FILE__, __LINE__, format, ##__VA_ARGS__)
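
/* Typical use at an error site (illustrative only; the VkResult and the
 * message are made up for the example):
 *
 *    return vk_startup_errorf(instance, VK_ERROR_INITIALIZATION_FAILED,
 *                             "failed to open device %s", path);
 */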

void
__tu_finishme(const char *file, int line, const char *format, ...)
   PRINTFLIKE(3, 4);

/**
 * Print a FINISHME message, including its source location.
 */
#define tu_finishme(format, ...) \
   do { \
      static bool reported = false; \
      if (!reported) { \
         __tu_finishme(__FILE__, __LINE__, format, ##__VA_ARGS__); \
         reported = true; \
      } \
   } while (0)

#define tu_stub() \
   do { \
      tu_finishme("stub %s", __func__); \
   } while (0)
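
/* Example usage (illustrative): flag a known-unimplemented corner once per
 * call site with
 *
 *    tu_finishme("sample locations with multiple viewports");
 *
 * while tu_stub() reports "stub <function>" via __func__ for an entirely
 * unimplemented entrypoint. The static 'reported' flag means each call site
 * logs at most once per process.
 */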

struct tu_memory_heap {
   /* Standard bits passed on to the client */
   VkDeviceSize size;
   VkMemoryHeapFlags flags;

   /** Copied from ANV:
    *
    * Driver-internal book-keeping.
    *
    * Align it to 64 bits to make atomic operations faster on 32 bit platforms.
    */
   VkDeviceSize used __attribute__ ((aligned (8)));
};

uint64_t
tu_get_system_heap_size(void);

struct tu_physical_device
{
   struct vk_physical_device vk;

   struct tu_instance *instance;

   const char *name;
   uint8_t driver_uuid[VK_UUID_SIZE];
   uint8_t device_uuid[VK_UUID_SIZE];
   uint8_t cache_uuid[VK_UUID_SIZE];

   struct wsi_device wsi_device;

   int local_fd;
   int master_fd;

   uint32_t gmem_size;
   uint64_t gmem_base;
   uint32_t ccu_offset_gmem;
   uint32_t ccu_offset_bypass;

   struct fd_dev_id dev_id;
   const struct fd_dev_info *info;

   int msm_major_version;
   int msm_minor_version;
   /* This is the driver's on-disk cache, used as a fallback as opposed to
    * the pipeline cache defined by apps.
    */
   struct disk_cache *disk_cache;

   struct tu_memory_heap heap;
};

enum tu_debug_flags
{
   TU_DEBUG_STARTUP = 1 << 0,
   TU_DEBUG_NIR = 1 << 1,
   TU_DEBUG_NOBIN = 1 << 3,
   TU_DEBUG_SYSMEM = 1 << 4,
   TU_DEBUG_FORCEBIN = 1 << 5,
   TU_DEBUG_NOUBWC = 1 << 6,
   TU_DEBUG_NOMULTIPOS = 1 << 7,
   TU_DEBUG_NOLRZ = 1 << 8,
   TU_DEBUG_PERFC = 1 << 9,
   TU_DEBUG_FLUSHALL = 1 << 10,
   TU_DEBUG_SYNCDRAW = 1 << 11,
};

struct tu_instance
{
   struct vk_instance vk;

   uint32_t api_version;
   int physical_device_count;
   struct tu_physical_device physical_devices[TU_MAX_DRM_DEVICES];

   enum tu_debug_flags debug_flags;
};

VkResult
tu_wsi_init(struct tu_physical_device *physical_device);
void
tu_wsi_finish(struct tu_physical_device *physical_device);

bool
tu_instance_extension_supported(const char *name);
uint32_t
tu_physical_device_api_version(struct tu_physical_device *dev);
bool
tu_physical_device_extension_supported(struct tu_physical_device *dev,
                                       const char *name);

struct cache_entry;

struct tu_pipeline_cache
{
   struct vk_object_base base;

   struct tu_device *device;
   pthread_mutex_t mutex;

   uint32_t total_size;
   uint32_t table_size;
   uint32_t kernel_count;
   struct cache_entry **hash_table;
   bool modified;

   VkAllocationCallbacks alloc;
};

struct tu_pipeline_key
{
};


/* queue types */
#define TU_QUEUE_GENERAL 0

#define TU_MAX_QUEUE_FAMILIES 1

struct tu_syncobj;
struct tu_u_trace_syncobj;

struct tu_queue
{
   struct vk_queue vk;

   struct tu_device *device;

   uint32_t msm_queue_id;
   int fence;

   /* Queue containing deferred submits */
   struct list_head queued_submits;
};

struct tu_bo
{
   uint32_t gem_handle;
   uint64_t size;
   uint64_t iova;
   void *map;
};

enum global_shader {
   GLOBAL_SH_VS_BLIT,
   GLOBAL_SH_VS_CLEAR,
   GLOBAL_SH_FS_BLIT,
   GLOBAL_SH_FS_BLIT_ZSCALE,
   GLOBAL_SH_FS_COPY_MS,
   GLOBAL_SH_FS_CLEAR0,
   GLOBAL_SH_FS_CLEAR_MAX = GLOBAL_SH_FS_CLEAR0 + MAX_RTS,
   GLOBAL_SH_COUNT,
};

#define TU_BORDER_COLOR_COUNT 4096
#define TU_BORDER_COLOR_BUILTIN 6

#define TU_BLIT_SHADER_SIZE 1024

/* This struct defines the layout of the global_bo */
struct tu6_global
{
   /* clear/blit shaders */
   uint32_t shaders[TU_BLIT_SHADER_SIZE];

   uint32_t seqno_dummy; /* dummy seqno for CP_EVENT_WRITE */
   uint32_t _pad0;
   volatile uint32_t vsc_draw_overflow;
   uint32_t _pad1;
   volatile uint32_t vsc_prim_overflow;
   uint32_t _pad2;
   uint64_t predicate;
   /* Scratch space for VPC_SO[i].FLUSH_BASE_LO/HI; must start on a 32-byte boundary. */
   struct {
      uint32_t offset;
      uint32_t pad[7];
   } flush_base[4];

   ALIGN16 uint32_t cs_indirect_xyz[3];

   /* note: larger global bo will be used for customBorderColors */
   struct bcolor_entry bcolor_builtin[TU_BORDER_COLOR_BUILTIN], bcolor[];
};
#define gb_offset(member) offsetof(struct tu6_global, member)
#define global_iova(cmd, member) ((cmd)->device->global_bo.iova + gb_offset(member))
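
/* For example, the device address the CP should use for the dummy seqno
 * write (a sketch, assuming a struct tu_cmd_buffer *cmd with a valid device
 * pointer):
 *
 *    uint64_t seqno_iova = global_iova(cmd, seqno_dummy);
 *
 * which expands to
 * cmd->device->global_bo.iova + offsetof(struct tu6_global, seqno_dummy).
 */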

/* extra space in vsc draw/prim streams */
#define VSC_PAD 0x40

struct tu_device
{
   struct vk_device vk;
   struct tu_instance *instance;

   struct tu_queue *queues[TU_MAX_QUEUE_FAMILIES];
   int queue_count[TU_MAX_QUEUE_FAMILIES];

   struct tu_physical_device *physical_device;
   int fd;
   int _lost;

   struct ir3_compiler *compiler;

   /* Backup in-memory cache to be used if the app doesn't provide one */
   struct tu_pipeline_cache *mem_cache;

#define MIN_SCRATCH_BO_SIZE_LOG2 12 /* A page */

   /* Currently the kernel driver uses a 32-bit GPU address space, but it
    * should be impossible to go beyond 48 bits.
    */
   struct {
      struct tu_bo bo;
      mtx_t construct_mtx;
      bool initialized;
   } scratch_bos[48 - MIN_SCRATCH_BO_SIZE_LOG2];

   struct tu_bo global_bo;

   struct ir3_shader_variant *global_shaders[GLOBAL_SH_COUNT];
   uint64_t global_shader_va[GLOBAL_SH_COUNT];

   uint32_t vsc_draw_strm_pitch;
   uint32_t vsc_prim_strm_pitch;
   BITSET_DECLARE(custom_border_color, TU_BORDER_COLOR_COUNT);
   mtx_t mutex;

   /* bo list for submits: */
   struct drm_msm_gem_submit_bo *bo_list;
   /* map bo handles to bo list index: */
   uint32_t *bo_idx;
   uint32_t bo_count, bo_list_size, bo_idx_size;
   mtx_t bo_mutex;

   /* Command streams to set pass index to a scratch reg */
   struct tu_cs *perfcntrs_pass_cs;
   struct tu_cs_entry *perfcntrs_pass_cs_entries;

   /* Condition variable for timeline semaphore to notify waiters when a
    * new submit is executed. */
   pthread_cond_t timeline_cond;
   pthread_mutex_t submit_mutex;

#ifdef ANDROID
   const void *gralloc;
   enum {
      TU_GRALLOC_UNKNOWN,
      TU_GRALLOC_CROS,
      TU_GRALLOC_OTHER,
   } gralloc_type;
#endif

   uint32_t submit_count;

   struct u_trace_context trace_context;

#ifdef HAVE_PERFETTO
   struct tu_perfetto_state perfetto;
#endif
};

void tu_init_clear_blit_shaders(struct tu_device *dev);

void tu_destroy_clear_blit_shaders(struct tu_device *dev);

VkResult _tu_device_set_lost(struct tu_device *device,
                             const char *msg, ...) PRINTFLIKE(2, 3);
#define tu_device_set_lost(dev, ...) \
   _tu_device_set_lost(dev, __VA_ARGS__)

static inline bool
tu_device_is_lost(struct tu_device *device)
{
   return unlikely(p_atomic_read(&device->_lost));
}

VkResult
tu_device_submit_deferred_locked(struct tu_device *dev);

VkResult
tu_device_wait_u_trace(struct tu_device *dev, struct tu_u_trace_syncobj *syncobj);

uint64_t
tu_device_ticks_to_ns(struct tu_device *dev, uint64_t ts);

enum tu_bo_alloc_flags
{
   TU_BO_ALLOC_NO_FLAGS = 0,
   TU_BO_ALLOC_ALLOW_DUMP = 1 << 0,
   TU_BO_ALLOC_GPU_READ_ONLY = 1 << 1,
};

VkResult
tu_bo_init_new(struct tu_device *dev, struct tu_bo *bo, uint64_t size,
               enum tu_bo_alloc_flags flags);
VkResult
tu_bo_init_dmabuf(struct tu_device *dev,
                  struct tu_bo *bo,
                  uint64_t size,
                  int fd);
int
tu_bo_export_dmabuf(struct tu_device *dev, struct tu_bo *bo);
void
tu_bo_finish(struct tu_device *dev, struct tu_bo *bo);
VkResult
tu_bo_map(struct tu_device *dev, struct tu_bo *bo);

/* Get a scratch bo for use inside a command buffer. This will always return
 * the same bo given the same size or similar sizes, so only one scratch bo
 * can be used at the same time. It's meant for short-lived things where we
 * need to write to some piece of memory, read from it, and then immediately
 * discard it.
 */
VkResult
tu_get_scratch_bo(struct tu_device *dev, uint64_t size, struct tu_bo **bo);
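
/* Illustrative usage sketch only: sizes map to power-of-two buckets no
 * smaller than 1 << MIN_SCRATCH_BO_SIZE_LOG2, so similar sizes alias the
 * same BO and two users with overlapping lifetimes would stomp on each
 * other.
 */
static inline VkResult
tu_example_scratch_usage(struct tu_device *dev)
{
   struct tu_bo *bo;
   VkResult result = tu_get_scratch_bo(dev, 0x2000, &bo);
   if (result != VK_SUCCESS)
      return result;
   /* Emit GPU work that writes to and then reads from bo->iova, then simply
    * stop using it; the BO is owned and reused by the device, never freed by
    * the caller.
    */
   return VK_SUCCESS;
}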

struct tu_cs_entry
{
   /* No ownership */
   const struct tu_bo *bo;

   uint32_t size;
   uint32_t offset;
};

struct tu_cs_memory {
   uint32_t *map;
   uint64_t iova;
};

struct tu_draw_state {
   uint64_t iova : 48;
   uint32_t size : 16;
};

enum tu_dynamic_state
{
   /* re-use VK_DYNAMIC_STATE_ enums for non-extended dynamic states */
   TU_DYNAMIC_STATE_SAMPLE_LOCATIONS = VK_DYNAMIC_STATE_STENCIL_REFERENCE + 1,
   TU_DYNAMIC_STATE_RB_DEPTH_CNTL,
   TU_DYNAMIC_STATE_RB_STENCIL_CNTL,
   TU_DYNAMIC_STATE_VB_STRIDE,
   TU_DYNAMIC_STATE_RASTERIZER_DISCARD,
   TU_DYNAMIC_STATE_COUNT,
   /* no associated draw state: */
   TU_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY = TU_DYNAMIC_STATE_COUNT,
   TU_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE,
   /* re-use the line width enum as it uses GRAS_SU_CNTL: */
   TU_DYNAMIC_STATE_GRAS_SU_CNTL = VK_DYNAMIC_STATE_LINE_WIDTH,
};

enum tu_draw_state_group_id
{
   TU_DRAW_STATE_PROGRAM_CONFIG,
   TU_DRAW_STATE_PROGRAM,
   TU_DRAW_STATE_PROGRAM_BINNING,
   TU_DRAW_STATE_TESS,
   TU_DRAW_STATE_VB,
   TU_DRAW_STATE_VI,
   TU_DRAW_STATE_VI_BINNING,
   TU_DRAW_STATE_RAST,
   TU_DRAW_STATE_BLEND,
   TU_DRAW_STATE_SHADER_GEOM_CONST,
   TU_DRAW_STATE_FS_CONST,
   TU_DRAW_STATE_DESC_SETS,
   TU_DRAW_STATE_DESC_SETS_LOAD,
   TU_DRAW_STATE_VS_PARAMS,
   TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM,
   TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM,
   TU_DRAW_STATE_LRZ,
   TU_DRAW_STATE_DEPTH_PLANE,

   /* dynamic state related draw states */
   TU_DRAW_STATE_DYNAMIC,
   TU_DRAW_STATE_COUNT = TU_DRAW_STATE_DYNAMIC + TU_DYNAMIC_STATE_COUNT,
};

enum tu_cs_mode
{

   /*
    * A command stream in TU_CS_MODE_GROW mode grows automatically whenever it
    * is full. tu_cs_begin must be called before command packet emission and
    * tu_cs_end must be called after.
    *
    * This mode may create multiple entries internally. The entries must be
    * submitted together.
    */
   TU_CS_MODE_GROW,

   /*
    * A command stream in TU_CS_MODE_EXTERNAL mode wraps an external,
    * fixed-size buffer. tu_cs_begin and tu_cs_end are optional and have no
    * effect on it.
    *
    * This mode does not create any entry or any BO.
    */
   TU_CS_MODE_EXTERNAL,

   /*
    * A command stream in TU_CS_MODE_SUB_STREAM mode does not support direct
    * command packet emission. tu_cs_begin_sub_stream must be called to get a
    * sub-stream to emit command packets to. When done with the sub-stream,
    * tu_cs_end_sub_stream must be called.
    *
    * This mode does not create any entry internally.
    */
   TU_CS_MODE_SUB_STREAM,
};
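
/* A minimal usage sketch for TU_CS_MODE_GROW, assuming the tu_cs helpers
 * declared in tu_cs.h (tu_cs_init/tu_cs_begin/tu_cs_end/tu_cs_finish):
 *
 *    struct tu_cs cs;
 *    tu_cs_init(&cs, dev, TU_CS_MODE_GROW, 4096);
 *    tu_cs_begin(&cs);
 *    ... emit command packets; the CS allocates new BOs/entries as it
 *    fills up ...
 *    tu_cs_end(&cs);
 *    ... submit cs.entries[0..cs.entry_count) together, then ...
 *    tu_cs_finish(&cs);
 */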

struct tu_cs
{
   uint32_t *start;
   uint32_t *cur;
   uint32_t *reserved_end;
   uint32_t *end;

   struct tu_device *device;
   enum tu_cs_mode mode;
   uint32_t next_bo_size;

   struct tu_cs_entry *entries;
   uint32_t entry_count;
   uint32_t entry_capacity;

   struct tu_bo **bos;
   uint32_t bo_count;
   uint32_t bo_capacity;

   /* state for cond_exec_start/cond_exec_end */
   uint32_t cond_flags;
   uint32_t *cond_dwords;
};

struct tu_device_memory
{
   struct vk_object_base base;

   struct tu_bo bo;
};

struct tu_descriptor_range
{
   uint64_t va;
   uint32_t size;
};

struct tu_descriptor_set
{
   struct vk_object_base base;

   const struct tu_descriptor_set_layout *layout;
   struct tu_descriptor_pool *pool;
   uint32_t size;

   uint64_t va;
   uint32_t *mapped_ptr;

   uint32_t *dynamic_descriptors;
};

struct tu_descriptor_pool_entry
{
   uint32_t offset;
   uint32_t size;
   struct tu_descriptor_set *set;
};

struct tu_descriptor_pool
{
   struct vk_object_base base;

   struct tu_bo bo;
   uint64_t current_offset;
   uint64_t size;

   uint8_t *host_memory_base;
   uint8_t *host_memory_ptr;
   uint8_t *host_memory_end;
   uint8_t *host_bo;

   uint32_t entry_count;
   uint32_t max_entry_count;
   struct tu_descriptor_pool_entry entries[0];
};

struct tu_descriptor_update_template_entry
{
   VkDescriptorType descriptor_type;

   /* The number of descriptors to update */
   uint32_t descriptor_count;

   /* Into mapped_ptr or dynamic_descriptors, in units of the respective array
    */
   uint32_t dst_offset;

   /* In dwords. Not valid/used for dynamic descriptors */
   uint32_t dst_stride;

   uint32_t buffer_offset;

   /* Only valid for combined image samplers and samplers */
   uint16_t has_sampler;

   /* In bytes */
   size_t src_offset;
   size_t src_stride;

   /* For push descriptors */
   const struct tu_sampler *immutable_samplers;
};

struct tu_descriptor_update_template
{
   struct vk_object_base base;

   uint32_t entry_count;
   VkPipelineBindPoint bind_point;
   struct tu_descriptor_update_template_entry entry[0];
};

struct tu_buffer
{
   struct vk_object_base base;

   VkDeviceSize size;

   VkBufferUsageFlags usage;
   VkBufferCreateFlags flags;

   struct tu_bo *bo;
   VkDeviceSize bo_offset;
};

static inline uint64_t
tu_buffer_iova(struct tu_buffer *buffer)
{
   return buffer->bo->iova + buffer->bo_offset;
}

const char *
tu_get_debug_option_name(int id);

const char *
tu_get_perftest_option_name(int id);

struct tu_descriptor_state
{
   struct tu_descriptor_set *sets[MAX_SETS];
   struct tu_descriptor_set push_set;
   uint32_t dynamic_descriptors[MAX_DYNAMIC_BUFFERS * A6XX_TEX_CONST_DWORDS];
};

enum tu_cmd_dirty_bits
{
   TU_CMD_DIRTY_VERTEX_BUFFERS = BIT(0),
   TU_CMD_DIRTY_VB_STRIDE = BIT(1),
   TU_CMD_DIRTY_GRAS_SU_CNTL = BIT(2),
   TU_CMD_DIRTY_RB_DEPTH_CNTL = BIT(3),
   TU_CMD_DIRTY_RB_STENCIL_CNTL = BIT(4),
   TU_CMD_DIRTY_DESC_SETS_LOAD = BIT(5),
   TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD = BIT(6),
   TU_CMD_DIRTY_SHADER_CONSTS = BIT(7),
   TU_CMD_DIRTY_LRZ = BIT(8),
   TU_CMD_DIRTY_VS_PARAMS = BIT(9),
   TU_CMD_DIRTY_RASTERIZER_DISCARD = BIT(10),
   /* all draw states were disabled and need to be re-enabled: */
   TU_CMD_DIRTY_DRAW_STATE = BIT(11)
};

/* There are only three cache domains we have to care about: the CCU, or
 * color cache unit, which is used for color and depth/stencil attachments
 * and copy/blit destinations and is split conceptually into color and depth,
 * and the universal cache, or UCHE, which is used for pretty much everything
 * else, except for the CP (uncached) and the host. We need to flush whenever
 * data crosses these boundaries.
 */

enum tu_cmd_access_mask {
   TU_ACCESS_UCHE_READ = 1 << 0,
   TU_ACCESS_UCHE_WRITE = 1 << 1,
   TU_ACCESS_CCU_COLOR_READ = 1 << 2,
   TU_ACCESS_CCU_COLOR_WRITE = 1 << 3,
   TU_ACCESS_CCU_DEPTH_READ = 1 << 4,
   TU_ACCESS_CCU_DEPTH_WRITE = 1 << 5,

   /* Experiments have shown that while it's safe to avoid flushing the CCU
    * after each blit/renderpass, it's not safe to assume that subsequent
    * lookups with a different attachment state will hit unflushed cache
    * entries. That is, the CCU needs to be flushed and possibly invalidated
    * when accessing memory with a different attachment state. Writing to an
    * attachment under the following conditions after clearing using the
    * normal 2d engine path is known to have issues:
    *
    * - It isn't the 0'th layer.
    * - There is more than one attachment, and this isn't the 0'th attachment
    *   (this seems to also depend on the cpp of the attachments).
    *
    * Our best guess is that the layer/MRT state is used when computing
    * the location of a cache entry in CCU, to avoid conflicts. We assume that
    * any access in a renderpass after or before an access by a transfer needs
    * a flush/invalidate, and use the _INCOHERENT variants to represent access
    * by a renderpass.
    */
   TU_ACCESS_CCU_COLOR_INCOHERENT_READ = 1 << 6,
   TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE = 1 << 7,
   TU_ACCESS_CCU_DEPTH_INCOHERENT_READ = 1 << 8,
   TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE = 1 << 9,

   /* Accesses which bypass any cache, e.g. writes via the host,
    * CP_EVENT_WRITE::BLIT, and the CP, are SYSMEM_WRITE.
    */
   TU_ACCESS_SYSMEM_READ = 1 << 10,
   TU_ACCESS_SYSMEM_WRITE = 1 << 11,

   /* Memory writes from the CP start in-order with draws and event writes,
    * but execute asynchronously and hence need a CP_WAIT_MEM_WRITES if read.
    */
   TU_ACCESS_CP_WRITE = 1 << 12,

   TU_ACCESS_READ =
      TU_ACCESS_UCHE_READ |
      TU_ACCESS_CCU_COLOR_READ |
      TU_ACCESS_CCU_DEPTH_READ |
      TU_ACCESS_CCU_COLOR_INCOHERENT_READ |
      TU_ACCESS_CCU_DEPTH_INCOHERENT_READ |
      TU_ACCESS_SYSMEM_READ,

   TU_ACCESS_WRITE =
      TU_ACCESS_UCHE_WRITE |
      TU_ACCESS_CCU_COLOR_WRITE |
      TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE |
      TU_ACCESS_CCU_DEPTH_WRITE |
      TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE |
      TU_ACCESS_SYSMEM_WRITE |
      TU_ACCESS_CP_WRITE,

   TU_ACCESS_ALL =
      TU_ACCESS_READ |
      TU_ACCESS_WRITE,
};
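
/* An illustrative mapping sketch (not the driver's actual vk-to-tu access
 * translation): per the comment above, attachment writes inside a
 * renderpass use the _INCOHERENT variant, while the same write done through
 * the 2D/transfer path uses the plain CCU access.
 */
static inline enum tu_cmd_access_mask
tu_example_color_write_access(bool in_renderpass)
{
   return in_renderpass ? TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE
                        : TU_ACCESS_CCU_COLOR_WRITE;
}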

/* Starting with a6xx, the pipeline is split into several "clusters" (really
 * pipeline stages). Each stage has its own pair of register banks and can
 * switch them independently, so that earlier stages can run ahead of later
 * ones. e.g. the FS of draw N and the VS of draw N + 1 can be executing at
 * the same time.
 *
 * As a result of this, we need to insert a WFI when an earlier stage depends
 * on the result of a later stage. CP_DRAW_* and CP_BLIT will wait for any
 * pending WFI's to complete before starting, and usually before reading
 * indirect params even, so a WFI also acts as a full "pipeline stall".
 *
 * Note, the names of the stages come from CLUSTER_* in devcoredump. We
 * include all the stages for completeness, even ones which do not read/write
 * anything.
 */

enum tu_stage {
   /* This doesn't correspond to a cluster, but we need it for tracking
    * indirect draw parameter reads etc.
    */
   TU_STAGE_CP,

   /* - Fetch index buffer
    * - Fetch vertex attributes, dispatch VS
    */
   TU_STAGE_FE,

   /* Execute all geometry stages (VS thru GS) */
   TU_STAGE_SP_VS,

   /* Write to VPC, do primitive assembly. */
   TU_STAGE_PC_VS,

   /* Rasterization. RB_DEPTH_BUFFER_BASE only exists in CLUSTER_PS according
    * to devcoredump so presumably this stage stalls for TU_STAGE_PS when
    * early depth testing is enabled before dispatching fragments? However
    * GRAS reads and writes LRZ directly.
    */
   TU_STAGE_GRAS,

   /* Execute FS */
   TU_STAGE_SP_PS,

   /* - Fragment tests
    * - Write color/depth
    * - Streamout writes (???)
    * - Varying interpolation (???)
    */
   TU_STAGE_PS,
};

enum tu_cmd_flush_bits {
   TU_CMD_FLAG_CCU_FLUSH_DEPTH = 1 << 0,
   TU_CMD_FLAG_CCU_FLUSH_COLOR = 1 << 1,
   TU_CMD_FLAG_CCU_INVALIDATE_DEPTH = 1 << 2,
   TU_CMD_FLAG_CCU_INVALIDATE_COLOR = 1 << 3,
   TU_CMD_FLAG_CACHE_FLUSH = 1 << 4,
   TU_CMD_FLAG_CACHE_INVALIDATE = 1 << 5,
   TU_CMD_FLAG_WAIT_MEM_WRITES = 1 << 6,
   TU_CMD_FLAG_WAIT_FOR_IDLE = 1 << 7,
   TU_CMD_FLAG_WAIT_FOR_ME = 1 << 8,

   TU_CMD_FLAG_ALL_FLUSH =
      TU_CMD_FLAG_CCU_FLUSH_DEPTH |
      TU_CMD_FLAG_CCU_FLUSH_COLOR |
      TU_CMD_FLAG_CACHE_FLUSH |
      /* Treat the CP as a sort of "cache" which may need to be "flushed" via
       * waiting for writes to land with CP_WAIT_MEM_WRITES.
       */
      TU_CMD_FLAG_WAIT_MEM_WRITES,

   TU_CMD_FLAG_ALL_INVALIDATE =
      TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
      TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
      TU_CMD_FLAG_CACHE_INVALIDATE,
};

/* Changing the CCU from sysmem mode to gmem mode or vice-versa is pretty
 * heavy, involving a CCU cache flush/invalidate and a WFI in order to change
 * which part of the gmem is used by the CCU. Here we keep track of the
 * current state of the CCU.
 */
enum tu_cmd_ccu_state {
   TU_CMD_CCU_SYSMEM,
   TU_CMD_CCU_GMEM,
   TU_CMD_CCU_UNKNOWN,
};

struct tu_cache_state {
   /* Caches which must be made available (flushed) eventually if there are
    * any users outside that cache domain, and caches which must be
    * invalidated eventually if there are any reads.
    */
   enum tu_cmd_flush_bits pending_flush_bits;
   /* Pending flushes */
   enum tu_cmd_flush_bits flush_bits;
};
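
/* An illustrative sketch of the intended flow (not the driver's actual
 * barrier helper): a write makes flushes "pending"; a read from outside the
 * producing domain promotes them to flush_bits, together with whatever
 * invalidates the reader needs, so they get emitted before the read.
 */
static inline void
tu_example_flush_for_read(struct tu_cache_state *cache,
                          enum tu_cmd_flush_bits read_invalidate_bits)
{
   /* Flush whatever the read might depend on... */
   cache->flush_bits |= cache->pending_flush_bits & TU_CMD_FLAG_ALL_FLUSH;
   cache->pending_flush_bits &= ~TU_CMD_FLAG_ALL_FLUSH;
   /* ...and invalidate the reader's caches so it sees the new data. */
   cache->flush_bits |= read_invalidate_bits;
}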

enum tu_lrz_force_disable_mask {
   TU_LRZ_FORCE_DISABLE_LRZ = 1 << 0,
   TU_LRZ_FORCE_DISABLE_WRITE = 1 << 1,
};

enum tu_lrz_direction {
   TU_LRZ_UNKNOWN,
   /* Depth func less/less-than: */
   TU_LRZ_LESS,
   /* Depth func greater/greater-than: */
   TU_LRZ_GREATER,
};

struct tu_lrz_pipeline
{
   uint32_t force_disable_mask;
   bool fs_has_kill;
   bool force_late_z;
   bool early_fragment_tests;
};

struct tu_lrz_state
{
   /* Depth/Stencil image currently in use for LRZ */
   struct tu_image *image;
   bool valid : 1;
   struct tu_draw_state state;
   enum tu_lrz_direction prev_direction;
};

struct tu_vs_params {
   uint32_t vertex_offset;
   uint32_t first_instance;
};

struct tu_cmd_state
{
   uint32_t dirty;

   struct tu_pipeline *pipeline;
   struct tu_pipeline *compute_pipeline;

   /* Vertex buffers, viewports, and scissors:
    * the state for these can be updated partially, so we need to save it
    * to be able to emit a complete draw state
    */
   struct {
      uint64_t base;
      uint32_t size;
      uint32_t stride;
   } vb[MAX_VBS];
   VkViewport viewport[MAX_VIEWPORTS];
   VkRect2D scissor[MAX_SCISSORS];
   uint32_t max_viewport, max_scissor;

   /* for dynamic states that can't be emitted directly */
   uint32_t dynamic_stencil_mask;
   uint32_t dynamic_stencil_wrmask;
   uint32_t dynamic_stencil_ref;

   uint32_t gras_su_cntl, rb_depth_cntl, rb_stencil_cntl;
   uint32_t pc_raster_cntl, vpc_unknown_9107;
   enum pc_di_primtype primtype;
   bool primitive_restart_enable;

   /* saved states to re-emit in TU_CMD_DIRTY_DRAW_STATE case */
   struct tu_draw_state dynamic_state[TU_DYNAMIC_STATE_COUNT];
   struct tu_draw_state vertex_buffers;
   struct tu_draw_state shader_const[2];
   struct tu_draw_state desc_sets;

   struct tu_draw_state vs_params;

   /* Index buffer */
   uint64_t index_va;
   uint32_t max_index_count;
   uint8_t index_size;

   /* Because the streamout base has to be 32-byte aligned,
    * there is an extra offset to deal with when it is
    * unaligned.
    */
   uint8_t streamout_offset[IR3_MAX_SO_BUFFERS];

   /* Renderpasses are tricky, because we may need to flush differently if
    * using sysmem vs. gmem and therefore we have to delay any flushing that
    * happens before a renderpass. So we have to have two copies of the flush
    * state, one for intra-renderpass flushes (i.e. renderpass dependencies)
    * and one for outside a renderpass.
    */
   struct tu_cache_state cache;
   struct tu_cache_state renderpass_cache;

   enum tu_cmd_ccu_state ccu_state;

   const struct tu_render_pass *pass;
   const struct tu_subpass *subpass;
   const struct tu_framebuffer *framebuffer;
   VkRect2D render_area;

   const struct tu_image_view **attachments;

   bool xfb_used;
   bool has_tess;
   bool has_subpass_predication;
   bool predication_active;
   bool disable_gmem;
   enum a5xx_line_mode line_mode;

   struct tu_lrz_state lrz;

   struct tu_draw_state depth_plane_state;

   struct tu_vs_params last_vs_params;
};

struct tu_cmd_pool
{
   struct vk_object_base base;

   VkAllocationCallbacks alloc;
   struct list_head cmd_buffers;
   struct list_head free_cmd_buffers;
   uint32_t queue_family_index;
};

enum tu_cmd_buffer_status
{
   TU_CMD_BUFFER_STATUS_INVALID,
   TU_CMD_BUFFER_STATUS_INITIAL,
   TU_CMD_BUFFER_STATUS_RECORDING,
   TU_CMD_BUFFER_STATUS_EXECUTABLE,
   TU_CMD_BUFFER_STATUS_PENDING,
};

struct tu_cmd_buffer
{
   struct vk_command_buffer vk;

   struct tu_device *device;

   struct tu_cmd_pool *pool;
   struct list_head pool_link;

   struct u_trace trace;
   struct u_trace_iterator trace_renderpass_start;
   struct u_trace_iterator trace_renderpass_end;

   VkCommandBufferUsageFlags usage_flags;
   VkCommandBufferLevel level;
   enum tu_cmd_buffer_status status;

   struct tu_cmd_state state;
   uint32_t queue_family_index;

   uint32_t push_constants[MAX_PUSH_CONSTANTS_SIZE / 4];
   VkShaderStageFlags push_constant_stages;
   struct tu_descriptor_set meta_push_descriptors;

   struct tu_descriptor_state descriptors[MAX_BIND_POINTS];

   VkResult record_result;

   struct tu_cs cs;
   struct tu_cs draw_cs;
   struct tu_cs tile_store_cs;
   struct tu_cs draw_epilogue_cs;
   struct tu_cs sub_cs;

   uint32_t vsc_draw_strm_pitch;
   uint32_t vsc_prim_strm_pitch;
};

/* Temporary struct for tracking a register state to be written, used by
 * a6xx-pack.h and tu_cs_emit_regs()
 */
struct tu_reg_value {
   uint32_t reg;
   uint64_t value;
   bool is_address;
   struct tu_bo *bo;
   bool bo_write;
   uint32_t bo_offset;
   uint32_t bo_shift;
};


void tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer,
                                    struct tu_cs *cs);

void tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer,
                             struct tu_cs *cs,
                             enum tu_cmd_ccu_state ccu_state);

void
tu6_emit_event_write(struct tu_cmd_buffer *cmd,
                     struct tu_cs *cs,
                     enum vgt_event_type event);

static inline struct tu_descriptor_state *
tu_get_descriptors_state(struct tu_cmd_buffer *cmd_buffer,
                         VkPipelineBindPoint bind_point)
{
   return &cmd_buffer->descriptors[bind_point];
}

struct tu_event
{
   struct vk_object_base base;
   struct tu_bo bo;
};

struct tu_push_constant_range
{
   uint32_t lo;
   uint32_t count;
};

struct tu_shader
{
   struct ir3_shader *ir3_shader;

   struct tu_push_constant_range push_consts;
   uint8_t active_desc_sets;
   bool multi_pos_output;
};

bool
tu_nir_lower_multiview(nir_shader *nir, uint32_t mask, bool *multi_pos_output,
                       struct tu_device *dev);

nir_shader *
tu_spirv_to_nir(struct tu_device *dev,
                const VkPipelineShaderStageCreateInfo *stage_info,
                gl_shader_stage stage);

struct tu_shader *
tu_shader_create(struct tu_device *dev,
                 nir_shader *nir,
                 unsigned multiview_mask,
                 struct tu_pipeline_layout *layout,
                 const VkAllocationCallbacks *alloc);

void
tu_shader_destroy(struct tu_device *dev,
                  struct tu_shader *shader,
                  const VkAllocationCallbacks *alloc);

struct tu_program_descriptor_linkage
{
   struct ir3_const_state const_state;

   uint32_t constlen;

   struct tu_push_constant_range push_consts;
};

struct tu_pipeline_executable {
   gl_shader_stage stage;

   struct ir3_info stats;
   bool is_binning;

   char *nir_from_spirv;
   char *nir_final;
   char *disasm;
};

struct tu_pipeline
{
   struct vk_object_base base;

   struct tu_cs cs;

   /* Separate BO for private memory since it should be GPU-writable */
   struct tu_bo pvtmem_bo;

   struct tu_pipeline_layout *layout;

   bool need_indirect_descriptor_sets;
   VkShaderStageFlags active_stages;
   uint32_t active_desc_sets;

   /* mask of enabled dynamic states
    * if BIT(i) is set, pipeline->dynamic_state[i] is *NOT* used
    */
   uint32_t dynamic_state_mask;
   struct tu_draw_state dynamic_state[TU_DYNAMIC_STATE_COUNT];

   /* for dynamic states which use the same register: */
   uint32_t gras_su_cntl, gras_su_cntl_mask;
   uint32_t rb_depth_cntl, rb_depth_cntl_mask;
   uint32_t rb_stencil_cntl, rb_stencil_cntl_mask;
   uint32_t pc_raster_cntl, pc_raster_cntl_mask;
   uint32_t vpc_unknown_9107, vpc_unknown_9107_mask;
   uint32_t stencil_wrmask;

   bool rb_depth_cntl_disable;

   enum a5xx_line_mode line_mode;

   /* draw states for the pipeline */
   struct tu_draw_state load_state, rast_state, blend_state;

   /* for vertex buffers state */
   uint32_t num_vbs;

   struct
   {
      struct tu_draw_state config_state;
      struct tu_draw_state state;
      struct tu_draw_state binning_state;

      struct tu_program_descriptor_linkage link[MESA_SHADER_STAGES];
   } program;

   struct
   {
      struct tu_draw_state state;
      struct tu_draw_state binning_state;
   } vi;

   struct
   {
      enum pc_di_primtype primtype;
      bool primitive_restart;
   } ia;

   struct
   {
      uint32_t patch_type;
      uint32_t param_stride;
      uint32_t hs_bo_regid;
      uint32_t ds_bo_regid;
      bool upper_left_domain_origin;
   } tess;

   struct
   {
      uint32_t local_size[3];
      uint32_t subgroup_size;
   } compute;

   bool provoking_vertex_last;

   struct tu_lrz_pipeline lrz;

   void *executables_mem_ctx;
   /* tu_pipeline_executable */
   struct util_dynarray executables;
};

void
tu6_emit_viewport(struct tu_cs *cs, const VkViewport *viewport, uint32_t num_viewport);

void
tu6_emit_scissor(struct tu_cs *cs, const VkRect2D *scs, uint32_t scissor_count);

void
tu6_clear_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs, struct tu_image *image, const VkClearValue *value);

void
tu6_emit_sample_locations(struct tu_cs *cs, const VkSampleLocationsInfoEXT *samp_loc);

void
tu6_emit_depth_bias(struct tu_cs *cs,
                    float constant_factor,
                    float clamp,
                    float slope_factor);

void tu6_emit_msaa(struct tu_cs *cs, VkSampleCountFlagBits samples,
                   enum a5xx_line_mode line_mode);

void tu6_emit_window_scissor(struct tu_cs *cs, uint32_t x1, uint32_t y1, uint32_t x2, uint32_t y2);

void tu6_emit_window_offset(struct tu_cs *cs, uint32_t x1, uint32_t y1);

void tu_disable_draw_states(struct tu_cmd_buffer *cmd, struct tu_cs *cs);

void tu6_apply_depth_bounds_workaround(struct tu_device *device,
                                       uint32_t *rb_depth_cntl);

struct tu_pvtmem_config {
   uint64_t iova;
   uint32_t per_fiber_size;
   uint32_t per_sp_size;
   bool per_wave;
};

void
tu6_emit_xs_config(struct tu_cs *cs,
                   gl_shader_stage stage,
                   const struct ir3_shader_variant *xs);

void
tu6_emit_xs(struct tu_cs *cs,
            gl_shader_stage stage,
            const struct ir3_shader_variant *xs,
            const struct tu_pvtmem_config *pvtmem,
            uint64_t binary_iova);

void
tu6_emit_vpc(struct tu_cs *cs,
             const struct ir3_shader_variant *vs,
             const struct ir3_shader_variant *hs,
             const struct ir3_shader_variant *ds,
             const struct ir3_shader_variant *gs,
             const struct ir3_shader_variant *fs,
             uint32_t patch_control_points);

void
tu6_emit_fs_inputs(struct tu_cs *cs, const struct ir3_shader_variant *fs);

struct tu_image_view;

void
tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
                  struct tu_cs *cs,
                  const struct tu_image_view *src,
                  const struct tu_image_view *dst,
                  uint32_t layer_mask,
                  uint32_t layers,
                  const VkRect2D *rect);

void
tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
                           struct tu_cs *cs,
                           uint32_t a,
                           const VkRenderPassBeginInfo *info);

void
tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
                         struct tu_cs *cs,
                         uint32_t a,
                         const VkRenderPassBeginInfo *info);

void
tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
                        struct tu_cs *cs,
                        uint32_t a,
                        bool force_load);

/* expose this function to be able to emit load without checking LOAD_OP */
void
tu_emit_load_gmem_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t a);

/* note: gmem store can also resolve */
void
tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
                         struct tu_cs *cs,
                         uint32_t a,
                         uint32_t gmem_a);

struct tu_native_format
{
   enum a6xx_format fmt : 8;
   enum a3xx_color_swap swap : 8;
   enum a6xx_tile_mode tile_mode : 8;
};

bool tu6_format_vtx_supported(VkFormat format);
struct tu_native_format tu6_format_vtx(VkFormat format);
bool tu6_format_color_supported(VkFormat format);
struct tu_native_format tu6_format_color(VkFormat format, enum a6xx_tile_mode tile_mode);
bool tu6_format_texture_supported(VkFormat format);
struct tu_native_format tu6_format_texture(VkFormat format, enum a6xx_tile_mode tile_mode);

static inline enum a6xx_format
tu6_base_format(VkFormat format)
{
   /* note: tu6_format_color doesn't care about tiling for .fmt field */
   return tu6_format_color(format, TILE6_LINEAR).fmt;
}

struct tu_image
{
   struct vk_object_base base;

   /* The original VkFormat provided by the client. This may not match any
    * of the actual surface formats.
    */
   VkFormat vk_format;
   uint32_t level_count;
   uint32_t layer_count;

   struct fdl_layout layout[3];
   uint32_t total_size;

#ifdef ANDROID
   /* For VK_ANDROID_native_buffer, the WSI image owns the memory. */
   VkDeviceMemory owned_memory;
#endif

   /* Set when bound */
   struct tu_bo *bo;
   VkDeviceSize bo_offset;

   uint32_t lrz_height;
   uint32_t lrz_pitch;
   uint32_t lrz_offset;

   bool shareable;
};

static inline uint32_t
tu_get_layerCount(const struct tu_image *image,
                  const VkImageSubresourceRange *range)
{
   return range->layerCount == VK_REMAINING_ARRAY_LAYERS
             ? image->layer_count - range->baseArrayLayer
             : range->layerCount;
}

static inline uint32_t
tu_get_levelCount(const struct tu_image *image,
                  const VkImageSubresourceRange *range)
{
   return range->levelCount == VK_REMAINING_MIP_LEVELS
             ? image->level_count - range->baseMipLevel
             : range->levelCount;
}

struct tu_image_view
{
   struct vk_object_base base;

   struct tu_image *image; /**< VkImageViewCreateInfo::image */

   uint64_t base_addr;
   uint64_t ubwc_addr;
   uint32_t layer_size;
   uint32_t ubwc_layer_size;

   /* used to determine if fast gmem store path can be used */
   VkExtent2D extent;
   bool need_y2_align;

   bool ubwc_enabled;

   uint32_t descriptor[A6XX_TEX_CONST_DWORDS];

   /* Descriptor for use as a storage image as opposed to a sampled image.
    * This has a few differences for cube maps (e.g. type).
    */
   uint32_t storage_descriptor[A6XX_TEX_CONST_DWORDS];

   /* pre-filled register values */
   uint32_t PITCH;
   uint32_t FLAG_BUFFER_PITCH;

   uint32_t RB_MRT_BUF_INFO;
   uint32_t SP_FS_MRT_REG;

   uint32_t SP_PS_2D_SRC_INFO;
   uint32_t SP_PS_2D_SRC_SIZE;

   uint32_t RB_2D_DST_INFO;

   uint32_t RB_BLIT_DST_INFO;

   /* for d32s8 separate stencil */
   uint64_t stencil_base_addr;
   uint32_t stencil_layer_size;
   uint32_t stencil_PITCH;
};

struct tu_sampler_ycbcr_conversion {
   struct vk_object_base base;

   VkFormat format;
   VkSamplerYcbcrModelConversion ycbcr_model;
   VkSamplerYcbcrRange ycbcr_range;
   VkComponentMapping components;
   VkChromaLocation chroma_offsets[2];
   VkFilter chroma_filter;
};

struct tu_sampler {
   struct vk_object_base base;

   uint32_t descriptor[A6XX_TEX_SAMP_DWORDS];
   struct tu_sampler_ycbcr_conversion *ycbcr_sampler;
};

void
tu_cs_image_ref(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);

void
tu_cs_image_ref_2d(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer, bool src);

void
tu_cs_image_flag_ref(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);

void
tu_cs_image_stencil_ref(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);

#define tu_image_view_stencil(iview, x) \
   ((iview->x & ~A6XX_##x##_COLOR_FORMAT__MASK) | A6XX_##x##_COLOR_FORMAT(FMT6_8_UINT))
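
/* For example, tu_image_view_stencil(iview, RB_2D_DST_INFO) yields the
 * view's pre-filled RB_2D_DST_INFO value with its color format field
 * replaced by FMT6_8_UINT, e.g. for writing the separate stencil plane of a
 * D32S8 image through the 2D engine.
 */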

VkResult
tu_gralloc_info(struct tu_device *device,
                const VkNativeBufferANDROID *gralloc_info,
                int *dma_buf,
                uint64_t *modifier);

VkResult
tu_import_memory_from_gralloc_handle(VkDevice device_h,
                                     int dma_buf,
                                     const VkAllocationCallbacks *alloc,
                                     VkImage image_h);

void
tu_image_view_init(struct tu_image_view *iview,
                   const VkImageViewCreateInfo *pCreateInfo,
                   bool limited_z24s8);

bool
ubwc_possible(VkFormat format, VkImageType type, VkImageUsageFlags usage, VkImageUsageFlags stencil_usage,
              const struct fd_dev_info *info, VkSampleCountFlagBits samples);

struct tu_buffer_view
{
   struct vk_object_base base;

   uint32_t descriptor[A6XX_TEX_CONST_DWORDS];

   struct tu_buffer *buffer;
};
void
tu_buffer_view_init(struct tu_buffer_view *view,
                    struct tu_device *device,
                    const VkBufferViewCreateInfo *pCreateInfo);

struct tu_attachment_info
{
   struct tu_image_view *attachment;
};

struct tu_framebuffer
{
   struct vk_object_base base;

   uint32_t width;
   uint32_t height;
   uint32_t layers;

   /* size of the first tile */
   VkExtent2D tile0;
   /* number of tiles */
   VkExtent2D tile_count;

   /* size of the first VSC pipe */
   VkExtent2D pipe0;
   /* number of VSC pipes */
   VkExtent2D pipe_count;

   /* pipe register values */
   uint32_t pipe_config[MAX_VSC_PIPES];
   uint32_t pipe_sizes[MAX_VSC_PIPES];

   uint32_t attachment_count;
   struct tu_attachment_info attachments[0];
};

void
tu_framebuffer_tiling_config(struct tu_framebuffer *fb,
                             const struct tu_device *device,
                             const struct tu_render_pass *pass);

struct tu_subpass_barrier {
   VkPipelineStageFlags src_stage_mask;
   VkPipelineStageFlags dst_stage_mask;
   VkAccessFlags src_access_mask;
   VkAccessFlags dst_access_mask;
   bool incoherent_ccu_color, incoherent_ccu_depth;
};

struct tu_subpass_attachment
{
   uint32_t attachment;

   /* For input attachments, true if it needs to be patched to refer to GMEM
    * in GMEM mode. This is false if it hasn't already been written as an
    * attachment.
    */
   bool patch_input_gmem;
};

struct tu_subpass
{
   uint32_t input_count;
   uint32_t color_count;
   uint32_t resolve_count;
   bool resolve_depth_stencil;

   /* True if there is any feedback loop at all. */
   bool feedback;

   /* True if we must invalidate UCHE thanks to a feedback loop. */
   bool feedback_invalidate;

   struct tu_subpass_attachment *input_attachments;
   struct tu_subpass_attachment *color_attachments;
   struct tu_subpass_attachment *resolve_attachments;
   struct tu_subpass_attachment depth_stencil_attachment;

   VkSampleCountFlagBits samples;

   uint32_t srgb_cntl;
   uint32_t multiview_mask;

   struct tu_subpass_barrier start_barrier;
};

struct tu_render_pass_attachment
{
   VkFormat format;
   uint32_t samples;
   uint32_t cpp;
   VkImageAspectFlags clear_mask;
   uint32_t clear_views;
   bool load;
   bool store;
   int32_t gmem_offset;
   /* for D32S8 separate stencil: */
   bool load_stencil;
   bool store_stencil;
   int32_t gmem_offset_stencil;
};

struct tu_render_pass
{
   struct vk_object_base base;

   uint32_t attachment_count;
   uint32_t subpass_count;
   uint32_t gmem_pixels;
   uint32_t tile_align_w;
   struct tu_subpass_attachment *subpass_attachments;
   struct tu_render_pass_attachment *attachments;
   struct tu_subpass_barrier end_barrier;
   struct tu_subpass subpasses[0];
};

#define PERF_CNTRS_REG 4

struct tu_perf_query_data
{
   uint32_t gid;      /* group-id */
   uint32_t cid;      /* countable-id within the group */
   uint32_t cntr_reg; /* counter register within the group */
   uint32_t pass;     /* pass index in which the countable can be requested */
   uint32_t app_idx;  /* index provided by apps */
};

struct tu_query_pool
{
   struct vk_object_base base;

   VkQueryType type;
   uint32_t stride;
   uint64_t size;
   uint32_t pipeline_statistics;
   struct tu_bo bo;

   /* For performance query */
   const struct fd_perfcntr_group *perf_group;
   uint32_t perf_group_count;
   uint32_t counter_index_count;
   struct tu_perf_query_data perf_query_data[0];
};

uint32_t
tu_subpass_get_attachment_to_resolve(const struct tu_subpass *subpass, uint32_t index);

void
tu_update_descriptor_sets(const struct tu_device *device,
                          VkDescriptorSet overrideSet,
                          uint32_t descriptorWriteCount,
                          const VkWriteDescriptorSet *pDescriptorWrites,
                          uint32_t descriptorCopyCount,
                          const VkCopyDescriptorSet *pDescriptorCopies);

void
tu_update_descriptor_set_with_template(
   const struct tu_device *device,
   struct tu_descriptor_set *set,
   VkDescriptorUpdateTemplate descriptorUpdateTemplate,
   const void *pData);

VkResult
tu_physical_device_init(struct tu_physical_device *device,
                        struct tu_instance *instance);
VkResult
tu_enumerate_devices(struct tu_instance *instance);

int
tu_drm_get_timestamp(struct tu_physical_device *device,
                     uint64_t *ts);

int
tu_drm_submitqueue_new(const struct tu_device *dev,
                       int priority,
                       uint32_t *queue_id);

void
tu_drm_submitqueue_close(const struct tu_device *dev, uint32_t queue_id);

int
tu_signal_fences(struct tu_device *device, struct tu_syncobj *fence1, struct tu_syncobj *fence2);

int
tu_syncobj_to_fd(struct tu_device *device, struct tu_syncobj *sync);


void
tu_copy_timestamp_buffer(struct u_trace_context *utctx, void *cmdstream,
                         void *ts_from, uint32_t from_offset,
                         void *ts_to, uint32_t to_offset,
                         uint32_t count);


VkResult
tu_create_copy_timestamp_cs(struct tu_cmd_buffer *cmdbuf, struct tu_cs **cs,
                            struct u_trace **trace_copy);

struct tu_u_trace_cmd_data
{
   struct tu_cs *timestamp_copy_cs;
   struct u_trace *trace;
};

void
tu_u_trace_cmd_data_finish(struct tu_device *device,
                           struct tu_u_trace_cmd_data *trace_data,
                           uint32_t entry_count);

struct tu_u_trace_flush_data
{
   uint32_t submission_id;
   struct tu_u_trace_syncobj *syncobj;
   uint32_t trace_count;
   struct tu_u_trace_cmd_data *cmd_trace_data;
};

#define TU_FROM_HANDLE(__tu_type, __name, __handle) \
   VK_FROM_HANDLE(__tu_type, __name, __handle)

VK_DEFINE_HANDLE_CASTS(tu_cmd_buffer, vk.base, VkCommandBuffer,
                       VK_OBJECT_TYPE_COMMAND_BUFFER)
VK_DEFINE_HANDLE_CASTS(tu_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE)
VK_DEFINE_HANDLE_CASTS(tu_instance, vk.base, VkInstance,
                       VK_OBJECT_TYPE_INSTANCE)
VK_DEFINE_HANDLE_CASTS(tu_physical_device, vk.base, VkPhysicalDevice,
                       VK_OBJECT_TYPE_PHYSICAL_DEVICE)
VK_DEFINE_HANDLE_CASTS(tu_queue, vk.base, VkQueue, VK_OBJECT_TYPE_QUEUE)

VK_DEFINE_NONDISP_HANDLE_CASTS(tu_cmd_pool, base, VkCommandPool,
                               VK_OBJECT_TYPE_COMMAND_POOL)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_buffer, base, VkBuffer,
                               VK_OBJECT_TYPE_BUFFER)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_buffer_view, base, VkBufferView,
                               VK_OBJECT_TYPE_BUFFER_VIEW)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_pool, base, VkDescriptorPool,
                               VK_OBJECT_TYPE_DESCRIPTOR_POOL)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_set, base, VkDescriptorSet,
                               VK_OBJECT_TYPE_DESCRIPTOR_SET)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_set_layout, base,
                               VkDescriptorSetLayout,
                               VK_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_update_template, base,
                               VkDescriptorUpdateTemplate,
                               VK_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_device_memory, base, VkDeviceMemory,
                               VK_OBJECT_TYPE_DEVICE_MEMORY)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_event, base, VkEvent, VK_OBJECT_TYPE_EVENT)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_framebuffer, base, VkFramebuffer,
                               VK_OBJECT_TYPE_FRAMEBUFFER)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_image, base, VkImage, VK_OBJECT_TYPE_IMAGE)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_image_view, base, VkImageView,
                               VK_OBJECT_TYPE_IMAGE_VIEW);
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline_cache, base, VkPipelineCache,
                               VK_OBJECT_TYPE_PIPELINE_CACHE)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline, base, VkPipeline,
                               VK_OBJECT_TYPE_PIPELINE)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline_layout, base, VkPipelineLayout,
                               VK_OBJECT_TYPE_PIPELINE_LAYOUT)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_query_pool, base, VkQueryPool,
                               VK_OBJECT_TYPE_QUERY_POOL)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_render_pass, base, VkRenderPass,
                               VK_OBJECT_TYPE_RENDER_PASS)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_sampler, base, VkSampler,
                               VK_OBJECT_TYPE_SAMPLER)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_sampler_ycbcr_conversion, base, VkSamplerYcbcrConversion,
                               VK_OBJECT_TYPE_SAMPLER_YCBCR_CONVERSION)

/* for TU_FROM_HANDLE with both VkFence and VkSemaphore: */
#define tu_syncobj_from_handle(x) ((struct tu_syncobj *) (uintptr_t) (x))

void
update_stencil_mask(uint32_t *value, VkStencilFaceFlags face, uint32_t mask);

#endif /* TU_PRIVATE_H */