/*
 * Copyright © 2009 Corbin Simpson
 * Copyright © 2015 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#ifndef AMDGPU_WINSYS_H
#define AMDGPU_WINSYS_H

#include "pipebuffer/pb_cache.h"
#include "pipebuffer/pb_slab.h"
#include "winsys/radeon_winsys.h"
#include "util/simple_mtx.h"
#include "util/u_queue.h"
#include "ac_linux_drm.h"
#include <amdgpu.h>
#include "amdgpu_userq.h"

struct amdgpu_cs;
/* DRM file descriptors, file descriptions and buffer sharing.
 *
 * amdgpu_device_initialize() creates one amdgpu_device_handle per GPU.
 * It looks up the sysfs path (e.g. /dev/dri/cardXX) for the given fd and
 * uses that path to return the already created amdgpu_device_handle or to
 * create a new one.
 *
 * Thus amdgpu_device_handle's fd is the fd from the first time the GPU
 * was initialized by amdgpu_device_initialize().
 *
 * KMS/GEM buffer handles are specific to a DRM file description, i.e. the
 * same handle value may refer to different underlying BOs in different
 * DRM file descriptions even for the same GPU. The diagram at
 * https://en.wikipedia.org/wiki/File:File_table_and_inode_table.svg shows
 * the relation of file descriptors to file descriptions in the file table.
 *
 * Two fds are considered different if each was obtained with a separate
 * open() call. Fds that are duplicates of one open fd (via dup() or fcntl
 * F_DUPFD) all compare as the same file description with
 * os_same_file_description(), which uses the kcmp system call.
 *
 * amdgpu_screen_winsys's fd tracks the file description which was
 * given to amdgpu_winsys_create(). This is the fd used by the application
 * using the driver and may be used in other ioctls (e.g. drmModeAddFB).
 *
 * amdgpu_winsys's fd is the file description used to initialize the
 * device handle in libdrm_amdgpu.
 *
 * The two fds can be different, even in systems with a single GPU, e.g. if
 * radv is initialized before radeonsi.
 *
 * This fd tracking is useful for buffer sharing. As an example, if an app
 * wants to use drmModeAddFB, it needs a KMS handle valid for its
 * fd (== amdgpu_screen_winsys::fd). If both fds are identical, there's
 * nothing to do: bo->u.real.kms_handle can be used directly
 * (see amdgpu_bo_get_handle). If they're different, the BO has to be exported
 * from the device fd as a dma-buf, then imported to the app fd to get the
 * KMS handle of the buffer for that app fd.
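 *
 * A minimal sketch of that export/import dance with the generic libdrm
 * dma-buf helpers (illustrative; error handling omitted and variable names
 * are not from this file):
 *
 *    int dmabuf_fd;
 *    uint32_t app_kms_handle;
 *    // Export the BO from the device fd as a dma-buf ...
 *    drmPrimeHandleToFD(aws->fd, bo->u.real.kms_handle, DRM_CLOEXEC,
 *                       &dmabuf_fd);
 *    // ... and import it into the app's fd to get a handle valid there.
 *    drmPrimeFDToHandle(sws->fd, dmabuf_fd, &app_kms_handle);
 *    close(dmabuf_fd);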
 *
 * Examples:
 * 1) OpenGL, then VAAPI:
 *    OpenGL                             | VAAPI (same device, != file description)
 *    -----------------------------------│-----------------------------------------
 *    fd = 5 (/dev/dri/renderD128)       │fd = 9 (/dev/dri/renderD128')
 *          │                            │       │
 *     device_handle = 0xffff0250        │ device_handle = 0xffff0250 (fd=5, re-used)
 *          │                            │       │
 *    amdgpu_screen_winsys = 0xffff0120  │amdgpu_winsys = 0xffff0470  ◄─────────────┐
 *          │   ├─ fd = dup(5) = 6       │       │   └─ sws_list = 0xffff0120       │
 *          │   └─ aws = 0xffff0470 ◄──┐ │       │                 0xffff0640 ◄───┐ │
 *          │                          │ │amdgpu_screen_winsys = 0xffff0640 ──────┘ │
 *    amdgpu_winsys = 0xffff0470    ───┘ │           └─ fd = dup(9) = 10            │
 *          │   ├─ dev = 0xffff0250      │                                          │
 *          │   ├─ sws_list = 0xffff0120 │                                          │
 *          │   └─ fd = 6                │                                          │
 *    dev_tab(0xffff0250) = 0xffff0470 ──│──────────────────────────────────────────┘
 *
 * 2) Vulkan (fd=5) then OpenGL (same device, != file description):
 *    -----------------------------
 *    fd = 9 (/dev/dri/renderD128)
 *           │
 *     device_handle = 0xffff0250 (fd=5, re-used)
 *           │
 *    amdgpu_screen_winsys = 0xffff0740
 *           │   ├─ fd = dup(9) = 10
 *           │   └─ aws = 0xffff0940 ◄───┐
 *    amdgpu_winsys = 0xffff0940 ────────┘
 *           │   ├─ dev = 0xffff0250
 *           │   ├─ sws_list = 0xffff0740
 *           │   └─ fd = 5
 *    dev_tab(0xffff0250) = 0xffff0940
 */

/* One struct amdgpu_screen_winsys is created in amdgpu_winsys_create() for one
 * fd. For fds that refer to the same file description (see the criteria above),
 * the already created amdgpu_screen_winsys is returned.
 */
struct amdgpu_screen_winsys {
   struct radeon_winsys base;
   struct amdgpu_winsys *aws;
   /* See comment above */
   int fd;
   struct pipe_reference reference;
   struct amdgpu_screen_winsys *next;

   /* Maps a BO to its KMS handle valid for this DRM file descriptor.
    * Protected by amdgpu_winsys::sws_list_lock.
    */
   struct hash_table *kms_handles;
};

/* At most this number of IBs can be busy per queue. When submitting a new IB while the oldest
 * IB ("AMDGPU_FENCE_RING_SIZE" IBs ago) is still busy, the CS thread will wait for it and will
 * also block all queues from submitting new IBs.
 */
#define AMDGPU_FENCE_RING_SIZE 32

/* The maximum number of queues that can be present. */
#define AMDGPU_MAX_QUEUES 6

/* This can be any unsigned integer type because the logic handles integer wraparound robustly.
 * However, uint8_t wraps around so quickly that some BOs might never become idle: since we
 * don't remove idle fences from BOs, a BO becomes "busy" again after the queue sequence number
 * wraps around, and it may stay "busy" in pb_cache long enough that we run out of memory.
 */
typedef uint16_t uint_seq_no;
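
/* Wraparound sketch (illustrative, not part of this header): unsigned subtraction makes the
 * "is the fence still in the ring?" test robust. With uint16_t, if latest_seq_no wrapped from
 * 65535 to 2 and a BO was last used at sequence number 65534, then:
 *
 *    (uint_seq_no)(2 - 65534) == 4, which is < AMDGPU_FENCE_RING_SIZE,
 *
 * so the BO is still correctly considered busy even across the wrap.
 */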

struct amdgpu_queue {
   /* Ring buffer of fences.
    *
    * We only remember a certain number of the most recent fences per queue. When we add a new
    * fence, we wait for the oldest one, which implies that all older fences not present
    * in the ring are idle. This way we don't have to keep track of a million fence references
    * for a million BOs.
    *
    * We only support 1 queue per IP. If an IP has multiple queues, we always add a fence
    * dependency on the previous fence to make it behave like there is only 1 queue.
    *
    * amdgpu_winsys_bo doesn't have a list of fences. It only remembers the last sequence number
    * for every queue where it was used. We then use the BO's sequence number to look up a fence
    * in this ring.
    */
   struct pipe_fence_handle *fences[AMDGPU_FENCE_RING_SIZE];

   /* The sequence number of the latest fence.
    *
    * This sequence number is global per queue per device, shared by all contexts, and generated
    * by the winsys, not the kernel.
    *
    * The latest fence is: fences[latest_seq_no % AMDGPU_FENCE_RING_SIZE]
    * The oldest fence is: fences[(latest_seq_no + 1) % AMDGPU_FENCE_RING_SIZE]
    * The oldest sequence number in the ring: latest_seq_no - AMDGPU_FENCE_RING_SIZE + 1
    *
    * The sequence number is in the ring if:
    *    latest_seq_no - buffer_seq_no < AMDGPU_FENCE_RING_SIZE
    * If the sequence number is not in the ring, the buffer is idle.
    *
    * Integer wraparound of the sequence number behaves as follows:
    *
    * The comparison above gives the correct answer if buffer_seq_no isn't older than UINT*_MAX.
    * If it's older than UINT*_MAX but not older than UINT*_MAX + AMDGPU_FENCE_RING_SIZE, we
    * incorrectly pick and wait for one of the fences in the ring. That's only a problem when
    * the type is so small (uint8_t) that seq_no wraps around very frequently, causing BOs to
    * never become idle in certain very unlucky scenarios, so that we run out of memory.
    */
   uint_seq_no latest_seq_no;

   /* The last context using this queue. */
   struct amdgpu_ctx *last_ctx;

   struct amdgpu_userq userq;
};
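
/* A minimal sketch of how a BO's per-queue sequence number maps back to a fence (illustrative;
 * the field and variable names below are hypothetical, and the real lookup logic lives in the
 * winsys BO code):
 *
 *    struct amdgpu_queue *queue = &aws->queues[i];
 *    uint_seq_no buffer_seq_no = bo_seq_no[i];  // last use of the BO on queue i
 *
 *    if ((uint_seq_no)(queue->latest_seq_no - buffer_seq_no) < AMDGPU_FENCE_RING_SIZE) {
 *       // Still in the ring: this fence tells us when the BO becomes idle.
 *       struct pipe_fence_handle *fence = queue->fences[buffer_seq_no % AMDGPU_FENCE_RING_SIZE];
 *    } else {
 *       // The fence has dropped out of the ring, so the BO is idle on this queue.
 *    }
 */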

/* This is part of every BO. */
struct amdgpu_seq_no_fences {
   /* A fence sequence number per queue. This number is used to look up the fence from
    * struct amdgpu_queue.
    *
    * This sequence number is global per queue per device, shared by all contexts, and generated
    * by the winsys, not the kernel.
    */
   uint_seq_no seq_no[AMDGPU_MAX_QUEUES];

   /* The mask of queues where seq_no[i] is valid. */
   uint8_t valid_fence_mask;
};

/* valid_fence_mask should have 1 bit for each queue. */
static_assert(sizeof(((struct amdgpu_seq_no_fences*)NULL)->valid_fence_mask) * 8 >= AMDGPU_MAX_QUEUES, "");
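
/* Sketch of visiting only the queues whose fences are valid for a BO (illustrative;
 * u_bit_scan() from util/u_math.h pops the lowest set bit of a mask):
 *
 *    unsigned mask = fences->valid_fence_mask;
 *    while (mask) {
 *       unsigned i = u_bit_scan(&mask);
 *       // fences->seq_no[i] can be looked up in aws->queues[i] here.
 *    }
 */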

/* One struct amdgpu_winsys is created for one GPU in amdgpu_winsys_create(). */
struct amdgpu_winsys {
   struct pipe_reference reference;
   /* See comment above */
   int fd;

   /* Protected by bo_fence_lock. */
   struct amdgpu_queue queues[AMDGPU_MAX_QUEUES];

   struct pb_cache bo_cache;
   struct pb_slabs bo_slabs;  /* Slab allocator. */

   ac_drm_device *dev;

   simple_mtx_t bo_fence_lock;

   int num_cs; /* The number of command streams created. */
   uint32_t surf_index_color;
   uint32_t surf_index_fmask;
   uint32_t next_bo_unique_id;
   uint64_t allocated_vram;
   uint64_t allocated_gtt;
   uint64_t mapped_vram;
   uint64_t mapped_gtt;
   uint64_t slab_wasted_vram;
   uint64_t slab_wasted_gtt;
   uint64_t buffer_wait_time; /* time spent in buffer_wait in ns */
   uint64_t num_gfx_IBs;
   uint64_t num_sdma_IBs;
   uint64_t num_mapped_buffers;
   uint64_t gfx_bo_list_counter;
   uint64_t gfx_ib_size_counter;

   struct radeon_info info;

   /* multithreaded IB submission */
   struct util_queue cs_queue;

   struct ac_addrlib *addrlib;

   bool check_vm;
   bool noop_cs;
   bool reserve_vmid;
   bool zero_all_vram_allocs;
#if MESA_DEBUG
   bool debug_all_bos;

   /* List of all allocated buffers */
   simple_mtx_t global_bo_list_lock;
   struct list_head global_bo_list;
   unsigned num_buffers;
#endif

   /* Single-linked list of all structs amdgpu_screen_winsys referencing this
    * struct amdgpu_winsys
    */
   simple_mtx_t sws_list_lock;
   struct amdgpu_screen_winsys *sws_list;

   /* For returning the same amdgpu_winsys_bo instance for exported
    * and re-imported buffers. */
   struct hash_table *bo_export_table;
   simple_mtx_t bo_export_table_lock;

   /* Since most winsys functions require struct radeon_winsys *, dummy_sws.base is used
    * for invoking them because sws_list can be NULL.
    */
   struct amdgpu_screen_winsys dummy_sws;

   /* With user queues, Mesa must ensure that the VM page tables are up to date
    * when jobs execute. For this, the VM ioctl now outputs a timeline syncobj,
    * which is used as one of the dependency fences in the user queue wait
    * ioctl.
    */
   uint32_t vm_timeline_syncobj;
   uint64_t vm_timeline_seq_num;
   simple_mtx_t vm_ioctl_lock;
};
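
/* A minimal sketch of how the timeline syncobj above could gate work on the VM page table
 * updates (illustrative; drmSyncobjTimelineWait() is the generic libdrm helper, while the real
 * winsys passes the syncobj and point as a wait dependency in the user queue ioctls):
 *
 *    uint32_t handle = aws->vm_timeline_syncobj;
 *    uint64_t point = aws->vm_timeline_seq_num;
 *    // Block until VM updates up to `point` have signaled.
 *    drmSyncobjTimelineWait(aws->fd, &handle, &point, 1, INT64_MAX, 0, NULL);
 */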

static inline struct amdgpu_screen_winsys *
amdgpu_screen_winsys(struct radeon_winsys *base)
{
   return (struct amdgpu_screen_winsys*)base;
}

static inline struct amdgpu_winsys *
amdgpu_winsys(struct radeon_winsys *base)
{
   return amdgpu_screen_winsys(base)->aws;
}
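
/* Usage sketch (illustrative): winsys callbacks receive the base pointer and downcast it with
 * the helpers above.
 *
 *    static void foo(struct radeon_winsys *rws)
 *    {
 *       struct amdgpu_winsys *aws = amdgpu_winsys(rws);
 *       ...
 *    }
 */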

void amdgpu_surface_init_functions(struct amdgpu_screen_winsys *sws);

#endif