/*
 * Copyright © 2018 Google, Inc.
 * Copyright © 2015 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include <fcntl.h>
#include <sys/mman.h>
#include <xf86drm.h>

#include "tu_knl_drm.h"
#include "tu_device.h"

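/* Clean (write back) the cache line containing "p" so that CPU writes become
 * visible to the GPU on non-coherent memory.
 */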
static inline void
tu_sync_cacheline_to_gpu(void const *p __attribute__((unused)))
{
#if DETECT_ARCH_AARCH64
   /* Clean data cache. */
   __asm volatile("dc cvac, %0" : : "r" (p) : "memory");
#elif (DETECT_ARCH_X86 || DETECT_ARCH_X86_64)
   __builtin_ia32_clflush(p);
#elif DETECT_ARCH_ARM
   /* DCCMVAC - same as DC CVAC on aarch64.
    * Seems to be illegal to call from userspace.
    */
   //__asm volatile("mcr p15, 0, %0, c7, c10, 1" : : "r" (p) : "memory");
   unreachable("Cache line clean is unsupported on ARMv7");
#endif
}

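/* Clean and invalidate the cache line containing "p" so that GPU writes
 * become visible to the CPU on non-coherent memory.
 */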
static inline void
tu_sync_cacheline_from_gpu(void const *p __attribute__((unused)))
{
#if DETECT_ARCH_AARCH64
   /* Clean and Invalidate data cache, there is no separate Invalidate. */
   __asm volatile("dc civac, %0" : : "r" (p) : "memory");
#elif (DETECT_ARCH_X86 || DETECT_ARCH_X86_64)
   __builtin_ia32_clflush(p);
#elif DETECT_ARCH_ARM
   /* DCCIMVAC - same as DC CIVAC on aarch64.
    * Seems to be illegal to call from userspace.
    */
   //__asm volatile("mcr p15, 0, %0, c7, c14, 1" : : "r" (p) : "memory");
   unreachable("Cache line invalidate is unsupported on ARMv7");
#endif
}

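/* Clean or invalidate the CPU-mapped range [offset, offset + size) of "bo",
 * one L1 data-cache line at a time. The start address is rounded down to a
 * cache-line boundary; size == VK_WHOLE_SIZE covers the rest of the BO.
 */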
void
tu_sync_cache_bo(struct tu_device *dev,
                 struct tu_bo *bo,
                 VkDeviceSize offset,
                 VkDeviceSize size,
                 enum tu_mem_sync_op op)
{
   uintptr_t level1_dcache_size = dev->physical_device->level1_dcache_size;
   char *start = (char *) bo->map + offset;
   char *end = start + (size == VK_WHOLE_SIZE ? (bo->size - offset) : size);

   start = (char *) ((uintptr_t) start & ~(level1_dcache_size - 1));

   for (; start < end; start += level1_dcache_size) {
      if (op == TU_MEM_SYNC_CACHE_TO_GPU) {
         tu_sync_cacheline_to_gpu(start);
      } else {
         tu_sync_cacheline_from_gpu(start);
      }
   }
}

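/* Common implementation of vkFlushMappedMemoryRanges and
 * vkInvalidateMappedMemoryRanges. A no-op on devices without cached
 * non-coherent memory.
 */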
static VkResult
sync_cache(VkDevice _device,
           enum tu_mem_sync_op op,
           uint32_t count,
           const VkMappedMemoryRange *ranges)
{
   TU_FROM_HANDLE(tu_device, device, _device);

   if (!device->physical_device->has_cached_non_coherent_memory) {
      tu_finishme(
         "data cache clean and invalidation are unsupported on this arch!");
      return VK_SUCCESS;
   }

   for (uint32_t i = 0; i < count; i++) {
      TU_FROM_HANDLE(tu_device_memory, mem, ranges[i].memory);
      tu_sync_cache_bo(device, mem->bo, ranges[i].offset, ranges[i].size, op);
   }

   return VK_SUCCESS;
}

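/* Make CPU writes in the given ranges visible to the GPU by cleaning the
 * CPU data cache.
 */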
VkResult
tu_FlushMappedMemoryRanges(VkDevice _device,
                           uint32_t memoryRangeCount,
                           const VkMappedMemoryRange *pMemoryRanges)
{
   return sync_cache(_device, TU_MEM_SYNC_CACHE_TO_GPU, memoryRangeCount,
                     pMemoryRanges);
}

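/* Make GPU writes in the given ranges visible to the CPU by invalidating the
 * CPU data cache.
 */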
VkResult
tu_InvalidateMappedMemoryRanges(VkDevice _device,
                                uint32_t memoryRangeCount,
                                const VkMappedMemoryRange *pMemoryRanges)
{
   return sync_cache(_device, TU_MEM_SYNC_CACHE_FROM_GPU, memoryRangeCount,
                     pMemoryRanges);
}

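/* Allocate a GPU virtual address for a BO from the userspace-managed VMA
 * heap. Replayable (capture/replay) BOs either take the client-provided
 * address or are allocated from the top of the address space so they cannot
 * collide with ordinary allocations.
 */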
VkResult
tu_allocate_userspace_iova(struct tu_device *dev,
                           uint64_t size,
                           uint64_t client_iova,
                           enum tu_bo_alloc_flags flags,
                           uint64_t *iova)
{
   *iova = 0;

   if (flags & TU_BO_ALLOC_REPLAYABLE) {
      if (client_iova) {
         if (util_vma_heap_alloc_addr(&dev->vma, client_iova, size)) {
            *iova = client_iova;
         } else {
            return VK_ERROR_INVALID_OPAQUE_CAPTURE_ADDRESS;
         }
      } else {
         /* We have to separate replayable IOVAs from ordinary ones so that
          * they don't clash. The easiest way to do this is to allocate them
          * from the other end of the address space.
          */
         dev->vma.alloc_high = true;
         *iova = util_vma_heap_alloc(&dev->vma, size, 0x1000);
      }
   } else {
      dev->vma.alloc_high = false;
      *iova = util_vma_heap_alloc(&dev->vma, size, 0x1000);
   }

   if (!*iova)
      return VK_ERROR_OUT_OF_DEVICE_MEMORY;

   return VK_SUCCESS;
}

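/* Export "bo" as a dma-buf file descriptor; returns -1 on failure. */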
int
tu_drm_export_dmabuf(struct tu_device *dev, struct tu_bo *bo)
{
   int prime_fd;
   int ret = drmPrimeHandleToFD(dev->fd, bo->gem_handle,
                                DRM_CLOEXEC | DRM_RDWR, &prime_fd);

   return ret == 0 ? prime_fd : -1;
}

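/* Drop a reference to "bo". When the last reference goes away, unmap it,
 * remove it from the device's BO list and either queue the VMA on the zombie
 * list (has_set_iova path, to be reclaimed later) or close the GEM handle
 * immediately.
 */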
void
tu_drm_bo_finish(struct tu_device *dev, struct tu_bo *bo)
{
   assert(bo->gem_handle);

   u_rwlock_rdlock(&dev->dma_bo_lock);

   if (!p_atomic_dec_zero(&bo->refcnt)) {
      u_rwlock_rdunlock(&dev->dma_bo_lock);
      return;
   }

   if (bo->map)
      munmap(bo->map, bo->size);

   tu_debug_bos_del(dev, bo);

   mtx_lock(&dev->bo_mutex);
   dev->bo_count--;
   dev->bo_list[bo->bo_list_idx] = dev->bo_list[dev->bo_count];

   struct tu_bo *exchanging_bo =
      tu_device_lookup_bo(dev, dev->bo_list[bo->bo_list_idx].handle);
   exchanging_bo->bo_list_idx = bo->bo_list_idx;

   if (bo->implicit_sync)
      dev->implicit_sync_bo_count--;

   mtx_unlock(&dev->bo_mutex);

   if (dev->physical_device->has_set_iova) {
      mtx_lock(&dev->vma_mutex);
      struct tu_zombie_vma *vma = (struct tu_zombie_vma *)
         u_vector_add(&dev->zombie_vmas);
      vma->gem_handle = bo->gem_handle;
#ifdef TU_HAS_VIRTIO
      vma->res_id = bo->res_id;
#endif
      vma->iova = bo->iova;
      vma->size = bo->size;
      vma->fence = p_atomic_read(&dev->queues[0]->fence);

      /* Must be cleared under the VMA mutex, or another thread could race to
       * reap the VMA, closing the BO and letting a new GEM allocation produce
       * this handle again.
       */
      memset(bo, 0, sizeof(*bo));
      mtx_unlock(&dev->vma_mutex);
   } else {
      /* Our BO structs are stored in a sparse array in the physical device,
       * so we don't want to free the BO pointer; instead we want to reset it
       * to 0, to mark that array entry as free.
       */
      uint32_t gem_handle = bo->gem_handle;
      memset(bo, 0, sizeof(*bo));

      /* Note that the virtgpu GEM_CLOSE path is a bit different, but it does
       * not use the !has_set_iova path, so we can ignore that.
       */
      struct drm_gem_close req = {
         .handle = gem_handle,
      };

      drmIoctl(dev->fd, DRM_IOCTL_GEM_CLOSE, &req);
   }

   u_rwlock_rdunlock(&dev->dma_bo_lock);
}

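/* Return the DRM syncobj handle backing a vk_sync, which must be either a
 * tu_timeline_sync or a plain DRM syncobj.
 */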
uint32_t
tu_syncobj_from_vk_sync(struct vk_sync *sync)
{
   uint32_t syncobj = -1;
   if (vk_sync_is_tu_timeline_sync(sync)) {
      syncobj = to_tu_timeline_sync(sync)->syncobj;
   } else if (vk_sync_type_is_drm_syncobj(sync->type)) {
      syncobj = vk_sync_as_drm_syncobj(sync)->syncobj;
   }

   assert(syncobj != -1);

   return syncobj;
}

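/* Create the DRM syncobj backing a tu_timeline_sync and set its initial
 * state.
 */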
static VkResult
tu_timeline_sync_init(struct vk_device *vk_device,
                      struct vk_sync *vk_sync,
                      uint64_t initial_value)
{
   struct tu_device *device = container_of(vk_device, struct tu_device, vk);
   struct tu_timeline_sync *sync = to_tu_timeline_sync(vk_sync);
   uint32_t flags = 0;

   assert(device->fd >= 0);

   int err = drmSyncobjCreate(device->fd, flags, &sync->syncobj);

   if (err < 0) {
      return vk_error(device, VK_ERROR_DEVICE_LOST);
   }

   sync->state = initial_value ? TU_TIMELINE_SYNC_STATE_SIGNALED :
                                 TU_TIMELINE_SYNC_STATE_RESET;

   return VK_SUCCESS;
}

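/* Destroy the DRM syncobj backing a tu_timeline_sync. */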
static void
tu_timeline_sync_finish(struct vk_device *vk_device,
                        struct vk_sync *vk_sync)
{
   struct tu_device *dev = container_of(vk_device, struct tu_device, vk);
   struct tu_timeline_sync *sync = to_tu_timeline_sync(vk_sync);

   assert(dev->fd >= 0);
   ASSERTED int err = drmSyncobjDestroy(dev->fd, sync->syncobj);
   assert(err == 0);
}

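/* Reset a tu_timeline_sync back to the unsignaled state. */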
static VkResult
tu_timeline_sync_reset(struct vk_device *vk_device,
                       struct vk_sync *vk_sync)
{
   struct tu_device *dev = container_of(vk_device, struct tu_device, vk);
   struct tu_timeline_sync *sync = to_tu_timeline_sync(vk_sync);

   int err = drmSyncobjReset(dev->fd, &sync->syncobj, 1);
   if (err) {
      return vk_errorf(dev, VK_ERROR_UNKNOWN,
                       "DRM_IOCTL_SYNCOBJ_RESET failed: %m");
   } else {
      sync->state = TU_TIMELINE_SYNC_STATE_RESET;
   }

   return VK_SUCCESS;
}

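/* Wait on a set of DRM syncobjs until the absolute timeout. Waits for all of
 * them when wait_all is set, otherwise returns as soon as any one signals.
 */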
static VkResult
drm_syncobj_wait(struct tu_device *device,
                 uint32_t *handles, uint32_t count_handles,
                 uint64_t timeout_nsec, bool wait_all)
{
   uint32_t syncobj_wait_flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT;
   if (wait_all)
      syncobj_wait_flags |= DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL;

   /* syncobj absolute timeouts are signed. Clamp OS_TIMEOUT_INFINITE down. */
   timeout_nsec = MIN2(timeout_nsec, (uint64_t) INT64_MAX);

   int err = drmSyncobjWait(device->fd, handles,
                            count_handles, timeout_nsec,
                            syncobj_wait_flags,
                            NULL /* first_signaled */);
   if (err && errno == ETIME) {
      return VK_TIMEOUT;
   } else if (err) {
      return vk_errorf(device, VK_ERROR_UNKNOWN,
                       "DRM_IOCTL_SYNCOBJ_WAIT failed: %m");
   }

   return VK_SUCCESS;
}

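/* CPU-side wait for a set of tu_timeline_syncs. Submitted syncs are waited
 * on through their DRM syncobjs; syncs still in the RESET state (not yet
 * submitted) are waited on via the device's timeline condition variable
 * until they are submitted or the timeout expires.
 */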
/* Based on anv_bo_sync_wait */
static VkResult
tu_timeline_sync_wait(struct vk_device *vk_device,
                      uint32_t wait_count,
                      const struct vk_sync_wait *waits,
                      enum vk_sync_wait_flags wait_flags,
                      uint64_t abs_timeout_ns)
{
   struct tu_device *dev = container_of(vk_device, struct tu_device, vk);
   bool wait_all = !(wait_flags & VK_SYNC_WAIT_ANY);

   uint32_t handles[wait_count];
   uint32_t submit_count;
   VkResult ret = VK_SUCCESS;
   uint32_t pending = wait_count;
   struct tu_timeline_sync *submitted_syncs[wait_count];

   while (pending) {
      pending = 0;
      submit_count = 0;

      for (unsigned i = 0; i < wait_count; ++i) {
         struct tu_timeline_sync *sync = to_tu_timeline_sync(waits[i].sync);

         if (sync->state == TU_TIMELINE_SYNC_STATE_RESET) {
            assert(!(wait_flags & VK_SYNC_WAIT_PENDING));
            pending++;
         } else if (sync->state == TU_TIMELINE_SYNC_STATE_SIGNALED) {
            if (wait_flags & VK_SYNC_WAIT_ANY)
               return VK_SUCCESS;
         } else if (sync->state == TU_TIMELINE_SYNC_STATE_SUBMITTED) {
            if (!(wait_flags & VK_SYNC_WAIT_PENDING)) {
               handles[submit_count] = sync->syncobj;
               submitted_syncs[submit_count++] = sync;
            }
         }
      }

      if (submit_count > 0) {
         do {
            ret = drm_syncobj_wait(dev, handles, submit_count, abs_timeout_ns, wait_all);
         } while (ret == VK_TIMEOUT && os_time_get_nano() < abs_timeout_ns);

         if (ret == VK_SUCCESS) {
            for (unsigned i = 0; i < submit_count; ++i) {
               struct tu_timeline_sync *sync = submitted_syncs[i];
               sync->state = TU_TIMELINE_SYNC_STATE_SIGNALED;
            }
         } else {
            /* Return the error, which also covers the timeout case. */
            return ret;
         }
      } else if (pending > 0) {
         /* If we've hit this then someone decided to vkWaitForFences before
          * they've actually submitted any of them to a queue. This is a
          * fairly pessimal case, so it's ok to lock here and use a standard
          * pthreads condition variable.
          */
         pthread_mutex_lock(&dev->submit_mutex);

         /* It's possible that some of the fences have changed state since the
          * last time we checked. Now that we have the lock, check for
          * pending fences again and don't wait if it's changed.
          */
         uint32_t now_pending = 0;
         for (uint32_t i = 0; i < wait_count; i++) {
            struct tu_timeline_sync *sync = to_tu_timeline_sync(waits[i].sync);
            if (sync->state == TU_TIMELINE_SYNC_STATE_RESET)
               now_pending++;
         }
         assert(now_pending <= pending);

         if (now_pending == pending) {
            struct timespec abstime = {
               .tv_sec = abs_timeout_ns / NSEC_PER_SEC,
               .tv_nsec = abs_timeout_ns % NSEC_PER_SEC,
            };

            ASSERTED int ret;
            ret = pthread_cond_timedwait(&dev->timeline_cond,
                                         &dev->submit_mutex, &abstime);
            assert(ret != EINVAL);
            if (os_time_get_nano() >= abs_timeout_ns) {
               pthread_mutex_unlock(&dev->submit_mutex);
               return VK_TIMEOUT;
            }
         }

         pthread_mutex_unlock(&dev->submit_mutex);
      }
   }

   return ret;
}

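/* vk_sync implementation backed by DRM syncobjs. Supports binary payloads
 * with CPU/GPU waits, CPU reset, wait-any and wait-pending.
 */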
const struct vk_sync_type tu_timeline_sync_type = {
   .size = sizeof(struct tu_timeline_sync),
   .features = (enum vk_sync_features)(
      VK_SYNC_FEATURE_BINARY | VK_SYNC_FEATURE_GPU_WAIT |
      VK_SYNC_FEATURE_GPU_MULTI_WAIT | VK_SYNC_FEATURE_CPU_WAIT |
      VK_SYNC_FEATURE_CPU_RESET | VK_SYNC_FEATURE_WAIT_ANY |
      VK_SYNC_FEATURE_WAIT_PENDING),
   .init = tu_timeline_sync_init,
   .finish = tu_timeline_sync_finish,
   .reset = tu_timeline_sync_reset,
   .wait_many = tu_timeline_sync_wait,
};