/*
 * Copyright © 2018 Google, Inc.
 * Copyright © 2015 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include <fcntl.h>
#include <sys/mman.h>
#include <xf86drm.h>

#include "tu_knl_drm.h"
#include "tu_device.h"

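/* Per-cacheline CPU cache maintenance helpers: "to GPU" cleans the line so
 * that CPU writes become visible to the GPU, "from GPU" cleans and
 * invalidates it so that GPU writes become visible to the CPU. ARMv7 has no
 * userspace-accessible equivalent, hence the unreachable() there.
 */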
static inline void
tu_sync_cacheline_to_gpu(void const *p __attribute__((unused)))
{
#if DETECT_ARCH_AARCH64
   /* Clean data cache. */
   __asm volatile("dc cvac, %0" : : "r" (p) : "memory");
#elif (DETECT_ARCH_X86 || DETECT_ARCH_X86_64)
   __builtin_ia32_clflush(p);
#elif DETECT_ARCH_ARM
   /* DCCMVAC - same as DC CVAC on aarch64.
    * Seems to be illegal to call from userspace.
    */
   //__asm volatile("mcr p15, 0, %0, c7, c10, 1" : : "r" (p) : "memory");
   unreachable("Cache line clean is unsupported on ARMv7");
#endif
}

static inline void
tu_sync_cacheline_from_gpu(void const *p __attribute__((unused)))
{
#if DETECT_ARCH_AARCH64
   /* Clean and Invalidate data cache, there is no separate Invalidate. */
   __asm volatile("dc civac, %0" : : "r" (p) : "memory");
#elif (DETECT_ARCH_X86 || DETECT_ARCH_X86_64)
   __builtin_ia32_clflush(p);
#elif DETECT_ARCH_ARM
   /* DCCIMVAC - same as DC CIVAC on aarch64.
    * Seems to be illegal to call from userspace.
    */
   //__asm volatile("mcr p15, 0, %0, c7, c14, 1" : : "r" (p) : "memory");
   unreachable("Cache line invalidate is unsupported on ARMv7");
#endif
}

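/* Clean or invalidate the CPU cache for a mapped range of a BO. The start
 * address is aligned down to the L1 cache line size and the range is walked
 * one cache line at a time; size == VK_WHOLE_SIZE covers everything from
 * offset to the end of the BO.
 */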
void
tu_sync_cache_bo(struct tu_device *dev,
                 struct tu_bo *bo,
                 VkDeviceSize offset,
                 VkDeviceSize size,
                 enum tu_mem_sync_op op)
{
   uintptr_t level1_dcache_size = dev->physical_device->level1_dcache_size;
   char *start = (char *) bo->map + offset;
   char *end = start + (size == VK_WHOLE_SIZE ? (bo->size - offset) : size);

   start = (char *) ((uintptr_t) start & ~(level1_dcache_size - 1));

   for (; start < end; start += level1_dcache_size) {
      if (op == TU_MEM_SYNC_CACHE_TO_GPU) {
         tu_sync_cacheline_to_gpu(start);
      } else {
         tu_sync_cacheline_from_gpu(start);
      }
   }
}

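/* Common implementation of vkFlushMappedMemoryRanges and
 * vkInvalidateMappedMemoryRanges. If the device does not expose a cached
 * non-coherent memory type there is nothing to maintain and this returns
 * immediately.
 */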
static VkResult
sync_cache(VkDevice _device,
           enum tu_mem_sync_op op,
           uint32_t count,
           const VkMappedMemoryRange *ranges)
{
   TU_FROM_HANDLE(tu_device, device, _device);

   if (!device->physical_device->has_cached_non_coherent_memory) {
      tu_finishme(
         "data cache clean and invalidation are unsupported on this arch!");
      return VK_SUCCESS;
   }

   for (uint32_t i = 0; i < count; i++) {
      TU_FROM_HANDLE(tu_device_memory, mem, ranges[i].memory);
      tu_sync_cache_bo(device, mem->bo, ranges[i].offset, ranges[i].size, op);
   }

   return VK_SUCCESS;
}

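/* Flush makes host writes visible to the device (cache clean), Invalidate
 * makes device writes visible to the host (cache clean + invalidate).
 */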
VkResult
tu_FlushMappedMemoryRanges(VkDevice _device,
                           uint32_t memoryRangeCount,
                           const VkMappedMemoryRange *pMemoryRanges)
{
   return sync_cache(_device, TU_MEM_SYNC_CACHE_TO_GPU, memoryRangeCount,
                     pMemoryRanges);
}

VkResult
tu_InvalidateMappedMemoryRanges(VkDevice _device,
                                uint32_t memoryRangeCount,
                                const VkMappedMemoryRange *pMemoryRanges)
{
   return sync_cache(_device, TU_MEM_SYNC_CACHE_FROM_GPU, memoryRangeCount,
                     pMemoryRanges);
}

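/* Allocate a GPU virtual address from the userspace-managed VMA heap.
 * Replayable BOs either get the exact address the application asked for
 * (opaque capture/replay) or are allocated from the top of the address space
 * so they cannot collide with ordinary allocations.
 */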
VkResult
tu_allocate_userspace_iova(struct tu_device *dev,
                           uint64_t size,
                           uint64_t client_iova,
                           enum tu_bo_alloc_flags flags,
                           uint64_t *iova)
{
   *iova = 0;

   if (flags & TU_BO_ALLOC_REPLAYABLE) {
      if (client_iova) {
         if (util_vma_heap_alloc_addr(&dev->vma, client_iova, size)) {
            *iova = client_iova;
         } else {
            return VK_ERROR_INVALID_OPAQUE_CAPTURE_ADDRESS;
         }
      } else {
         /* We have to keep replayable IOVAs separate from ordinary ones so
          * that they don't clash. The easiest way to do this is to allocate
          * them from the other end of the address space.
          */
         dev->vma.alloc_high = true;
         *iova = util_vma_heap_alloc(&dev->vma, size, 0x1000);
      }
   } else {
      dev->vma.alloc_high = false;
      *iova = util_vma_heap_alloc(&dev->vma, size, 0x1000);
   }

   if (!*iova)
      return VK_ERROR_OUT_OF_DEVICE_MEMORY;

   return VK_SUCCESS;
}

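/* Export a BO as a dma-buf; returns the prime fd on success or -1 on failure. */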
int
tu_drm_export_dmabuf(struct tu_device *dev, struct tu_bo *bo)
{
   int prime_fd;
   int ret = drmPrimeHandleToFD(dev->fd, bo->gem_handle,
                                DRM_CLOEXEC | DRM_RDWR, &prime_fd);

   return ret == 0 ? prime_fd : -1;
}

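/* Drop a reference to a BO. On the last reference the BO is unmapped and
 * removed from the device's BO list; the GEM handle is then either closed
 * immediately or, when userspace manages IOVAs, handed to the zombie VMA
 * list together with the current queue fence, presumably so the VA range and
 * handle are only reclaimed once the GPU is done with them.
 */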
void
tu_drm_bo_finish(struct tu_device *dev, struct tu_bo *bo)
{
   assert(bo->gem_handle);

   u_rwlock_rdlock(&dev->dma_bo_lock);

   if (!p_atomic_dec_zero(&bo->refcnt)) {
      u_rwlock_rdunlock(&dev->dma_bo_lock);
      return;
   }

   if (bo->map)
      munmap(bo->map, bo->size);

   tu_debug_bos_del(dev, bo);

   mtx_lock(&dev->bo_mutex);
   dev->bo_count--;
   dev->bo_list[bo->bo_list_idx] = dev->bo_list[dev->bo_count];

   struct tu_bo *exchanging_bo =
      tu_device_lookup_bo(dev, dev->bo_list[bo->bo_list_idx].handle);
   exchanging_bo->bo_list_idx = bo->bo_list_idx;

   if (bo->implicit_sync)
      dev->implicit_sync_bo_count--;

   mtx_unlock(&dev->bo_mutex);

   if (dev->physical_device->has_set_iova) {
      mtx_lock(&dev->vma_mutex);
      struct tu_zombie_vma *vma = (struct tu_zombie_vma *)
            u_vector_add(&dev->zombie_vmas);
      vma->gem_handle = bo->gem_handle;
#ifdef TU_HAS_VIRTIO
      vma->res_id = bo->res_id;
#endif
      vma->iova = bo->iova;
      vma->size = bo->size;
      vma->fence = p_atomic_read(&dev->queues[0]->fence);

      /* Must be cleared under the VMA mutex, or another thread could race to
       * reap the VMA, closing the BO and letting a new GEM allocation produce
       * this handle again.
       */
      memset(bo, 0, sizeof(*bo));
      mtx_unlock(&dev->vma_mutex);
   } else {
      /* Our BO structs are stored in a sparse array in the physical device,
       * so we don't want to free the BO pointer; instead we reset it to 0 to
       * signal that array entry as being free.
       */
      uint32_t gem_handle = bo->gem_handle;
      memset(bo, 0, sizeof(*bo));

      /* Note that the virtgpu GEM_CLOSE path is a bit different, but it does
       * not use the !has_set_iova path, so we can ignore it here.
       */
      struct drm_gem_close req = {
         .handle = gem_handle,
      };

      drmIoctl(dev->fd, DRM_IOCTL_GEM_CLOSE, &req);
   }

   u_rwlock_rdunlock(&dev->dma_bo_lock);
}

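/* Return the DRM syncobj handle backing either a tu_timeline_sync or a
 * plain vk_drm_syncobj.
 */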
uint32_t
tu_syncobj_from_vk_sync(struct vk_sync *sync)
{
   uint32_t syncobj = -1;
   if (vk_sync_is_tu_timeline_sync(sync)) {
      syncobj = to_tu_timeline_sync(sync)->syncobj;
   } else if (vk_sync_type_is_drm_syncobj(sync->type)) {
      syncobj = vk_sync_as_drm_syncobj(sync)->syncobj;
   }

   assert(syncobj != -1);

   return syncobj;
}

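/* vk_sync implementation backed by a binary DRM syncobj plus a CPU-side
 * state (RESET / SUBMITTED / SIGNALED) so that CPU waits can be handled
 * before anything has actually been submitted to a queue.
 */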
static VkResult
tu_timeline_sync_init(struct vk_device *vk_device,
                      struct vk_sync *vk_sync,
                      uint64_t initial_value)
{
   struct tu_device *device = container_of(vk_device, struct tu_device, vk);
   struct tu_timeline_sync *sync = to_tu_timeline_sync(vk_sync);
   uint32_t flags = 0;

   assert(device->fd >= 0);

   int err = drmSyncobjCreate(device->fd, flags, &sync->syncobj);

   if (err < 0) {
      return vk_error(device, VK_ERROR_DEVICE_LOST);
   }

   sync->state = initial_value ? TU_TIMELINE_SYNC_STATE_SIGNALED :
                                 TU_TIMELINE_SYNC_STATE_RESET;

   return VK_SUCCESS;
}

static void
tu_timeline_sync_finish(struct vk_device *vk_device,
                        struct vk_sync *vk_sync)
{
   struct tu_device *dev = container_of(vk_device, struct tu_device, vk);
   struct tu_timeline_sync *sync = to_tu_timeline_sync(vk_sync);

   assert(dev->fd >= 0);
   ASSERTED int err = drmSyncobjDestroy(dev->fd, sync->syncobj);
   assert(err == 0);
}

static VkResult
tu_timeline_sync_reset(struct vk_device *vk_device,
                       struct vk_sync *vk_sync)
{
   struct tu_device *dev = container_of(vk_device, struct tu_device, vk);
   struct tu_timeline_sync *sync = to_tu_timeline_sync(vk_sync);

   int err = drmSyncobjReset(dev->fd, &sync->syncobj, 1);
   if (err) {
      return vk_errorf(dev, VK_ERROR_UNKNOWN,
                       "DRM_IOCTL_SYNCOBJ_RESET failed: %m");
   } else {
      sync->state = TU_TIMELINE_SYNC_STATE_RESET;
   }

   return VK_SUCCESS;
}

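/* Thin wrapper around DRM_IOCTL_SYNCOBJ_WAIT that also waits for submission,
 * clamps the absolute timeout to INT64_MAX and maps ETIME to VK_TIMEOUT.
 */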
static VkResult
drm_syncobj_wait(struct tu_device *device,
                 uint32_t *handles, uint32_t count_handles,
                 uint64_t timeout_nsec, bool wait_all)
{
   uint32_t syncobj_wait_flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT;
   if (wait_all) syncobj_wait_flags |= DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL;

   /* syncobj absolute timeouts are signed.  clamp OS_TIMEOUT_INFINITE down. */
   timeout_nsec = MIN2(timeout_nsec, (uint64_t)INT64_MAX);

   int err = drmSyncobjWait(device->fd, handles,
                            count_handles, timeout_nsec,
                            syncobj_wait_flags,
                            NULL /* first_signaled */);
   if (err && errno == ETIME) {
      return VK_TIMEOUT;
   } else if (err) {
      return vk_errorf(device, VK_ERROR_UNKNOWN,
                       "DRM_IOCTL_SYNCOBJ_WAIT failed: %m");
   }

   return VK_SUCCESS;
}

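/* CPU wait over a mix of unsubmitted, submitted and signaled syncs: syncs in
 * the SUBMITTED state are waited on through drmSyncobjWait, while syncs still
 * in the RESET state are waited on via the device's submit condition variable
 * until they are submitted or the timeout expires.
 */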
/* Based on anv_bo_sync_wait */
static VkResult
tu_timeline_sync_wait(struct vk_device *vk_device,
                      uint32_t wait_count,
                      const struct vk_sync_wait *waits,
                      enum vk_sync_wait_flags wait_flags,
                      uint64_t abs_timeout_ns)
{
   struct tu_device *dev = container_of(vk_device, struct tu_device, vk);
   bool wait_all = !(wait_flags & VK_SYNC_WAIT_ANY);

   uint32_t handles[wait_count];
   uint32_t submit_count;
   VkResult ret = VK_SUCCESS;
   uint32_t pending = wait_count;
   struct tu_timeline_sync *submitted_syncs[wait_count];

   while (pending) {
      pending = 0;
      submit_count = 0;

      for (unsigned i = 0; i < wait_count; ++i) {
         struct tu_timeline_sync *sync = to_tu_timeline_sync(waits[i].sync);

         if (sync->state == TU_TIMELINE_SYNC_STATE_RESET) {
            assert(!(wait_flags & VK_SYNC_WAIT_PENDING));
            pending++;
         } else if (sync->state == TU_TIMELINE_SYNC_STATE_SIGNALED) {
            if (wait_flags & VK_SYNC_WAIT_ANY)
               return VK_SUCCESS;
         } else if (sync->state == TU_TIMELINE_SYNC_STATE_SUBMITTED) {
            if (!(wait_flags & VK_SYNC_WAIT_PENDING)) {
               handles[submit_count] = sync->syncobj;
               submitted_syncs[submit_count++] = sync;
            }
         }
      }

      if (submit_count > 0) {
         do {
            ret = drm_syncobj_wait(dev, handles, submit_count,
                                   abs_timeout_ns, wait_all);
         } while (ret == VK_TIMEOUT && os_time_get_nano() < abs_timeout_ns);

         if (ret == VK_SUCCESS) {
            for (unsigned i = 0; i < submit_count; ++i) {
               struct tu_timeline_sync *sync = submitted_syncs[i];
               sync->state = TU_TIMELINE_SYNC_STATE_SIGNALED;
            }
         } else {
            /* return error covering timeout */
            return ret;
         }
      } else if (pending > 0) {
         /* If we've hit this then someone decided to vkWaitForFences before
          * they've actually submitted any of them to a queue.  This is a
          * fairly pessimal case, so it's ok to lock here and use a standard
          * pthreads condition variable.
          */
         pthread_mutex_lock(&dev->submit_mutex);

         /* It's possible that some of the fences have changed state since the
          * last time we checked.  Now that we have the lock, check for
          * pending fences again and don't wait if it's changed.
          */
         uint32_t now_pending = 0;
         for (uint32_t i = 0; i < wait_count; i++) {
            struct tu_timeline_sync *sync = to_tu_timeline_sync(waits[i].sync);
            if (sync->state == TU_TIMELINE_SYNC_STATE_RESET)
               now_pending++;
         }
         assert(now_pending <= pending);

         if (now_pending == pending) {
            struct timespec abstime = {
               .tv_sec = abs_timeout_ns / NSEC_PER_SEC,
               .tv_nsec = abs_timeout_ns % NSEC_PER_SEC,
            };

            ASSERTED int ret;
            ret = pthread_cond_timedwait(&dev->timeline_cond,
                                         &dev->submit_mutex, &abstime);
            assert(ret != EINVAL);
            if (os_time_get_nano() >= abs_timeout_ns) {
               pthread_mutex_unlock(&dev->submit_mutex);
               return VK_TIMEOUT;
            }
         }

         pthread_mutex_unlock(&dev->submit_mutex);
      }
   }

   return ret;
}

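/* vk_sync_type for the syncobj-backed sync above. Note that only binary
 * sync features are advertised here, despite the "timeline" name.
 */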
const struct vk_sync_type tu_timeline_sync_type = {
   .size = sizeof(struct tu_timeline_sync),
   .features = (enum vk_sync_features)(
      VK_SYNC_FEATURE_BINARY | VK_SYNC_FEATURE_GPU_WAIT |
      VK_SYNC_FEATURE_GPU_MULTI_WAIT | VK_SYNC_FEATURE_CPU_WAIT |
      VK_SYNC_FEATURE_CPU_RESET | VK_SYNC_FEATURE_WAIT_ANY |
      VK_SYNC_FEATURE_WAIT_PENDING),
   .init = tu_timeline_sync_init,
   .finish = tu_timeline_sync_finish,
   .reset = tu_timeline_sync_reset,
   .wait_many = tu_timeline_sync_wait,
};