/*
 * Copyright © 2020 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
#ifndef VK_DEVICE_H
#define VK_DEVICE_H

#include "rmv/vk_rmv_common.h"
#include "vk_dispatch_table.h"
#include "vk_extensions.h"
#include "vk_object.h"
#include "vk_physical_device_features.h"

#include "util/list.h"
#include "util/simple_mtx.h"
#include "util/u_atomic.h"

#ifdef __cplusplus
extern "C" {
#endif

struct vk_acceleration_structure_build_ops;
struct vk_command_buffer_ops;
struct vk_device_shader_ops;
struct vk_sync;

enum vk_queue_submit_mode {
   /** Submits happen immediately
    *
    * `vkQueueSubmit()` and `vkQueueBindSparse()` call
    * ``vk_queue::driver_submit`` directly for all submits and the last call to
    * ``vk_queue::driver_submit`` will have completed by the time
    * `vkQueueSubmit()` or `vkQueueBindSparse()` return.
    */
   VK_QUEUE_SUBMIT_MODE_IMMEDIATE,

   /** Submits may be deferred until a future `vk_queue_flush()`
    *
    * Submits are added to the queue and `vk_queue_flush()` is called.
    * However, any submits with unsatisfied dependencies will be left on the
    * queue until a future `vk_queue_flush()` call.  This is used for
    * implementing emulated timeline semaphores without threading.
    */
   VK_QUEUE_SUBMIT_MODE_DEFERRED,

   /** Submits will be added to the queue and handled later by a thread
    *
    * This places additional requirements on the vk_sync types used by the
    * driver:
    *
    *    1. All `vk_sync` types which support `VK_SYNC_FEATURE_GPU_WAIT` also
    *       support `VK_SYNC_FEATURE_WAIT_PENDING` so that the threads can
    *       sort out when a given submit has all its dependencies resolved.
    *
    *    2. All binary `vk_sync` types which support `VK_SYNC_FEATURE_GPU_WAIT`
    *       also support `VK_SYNC_FEATURE_CPU_RESET` so we can reset
    *       semaphores after waiting on them.
    *
    *    3. All vk_sync types used as permanent payloads of semaphores support
    *       ``vk_sync_type::move`` so that it can move the pending signal into a
    *       temporary vk_sync and reset the semaphore.
    *
    * This is required for shared timeline semaphores where we need to handle
    * wait-before-signal by threading in the driver if we ever see an
    * unresolved dependency.
    */
   VK_QUEUE_SUBMIT_MODE_THREADED,

   /** Threaded but only if we need it to resolve dependencies
    *
    * This imposes all the same requirements on `vk_sync` types as
    * `VK_QUEUE_SUBMIT_MODE_THREADED`.
    */
   VK_QUEUE_SUBMIT_MODE_THREADED_ON_DEMAND,
};

/** Base struct for VkDevice */
struct vk_device {
   struct vk_object_base base;

   /** Allocator used to create this device
    *
    * This is used as a fall-back for when a NULL pAllocator is passed into a
    * device-level create function such as vkCreateImage().
    */
   VkAllocationCallbacks alloc;

   /** Pointer to the physical device */
   struct vk_physical_device *physical;

   /** Table of enabled extensions */
   struct vk_device_extension_table enabled_extensions;

   /** Table of enabled features */
   struct vk_features enabled_features;

   /** Device-level dispatch table */
   struct vk_device_dispatch_table dispatch_table;

   /** Command dispatch table
    *
    * This is used for emulated secondary command buffer support.  To use
    * emulated (trace/replay) secondary command buffers:
    *
    *  1. Provide your "real" command buffer dispatch table here.  Because
    *     this doesn't get populated by vk_device_init(), the driver will have
    *     to add the vk_common entrypoints to this table itself.
    *
    *  2. Add vk_enqueue_unless_primary_device_entrypoint_table to your device
    *     level dispatch table.
    */
   const struct vk_device_dispatch_table *command_dispatch_table;
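
   /* A hedged sketch (one possible setup, not the only one) of the two
    * steps above.  `driver_device_entrypoints` is a hypothetical
    * driver-provided entrypoint table; the vk_common and enqueue tables
    * come from the generated dispatch code:
    *
    *    // 1. "Real" command buffer dispatch table.  vk_device_init() does
    *    //    not populate this, so the vk_common entrypoints are added here.
    *    vk_device_dispatch_table_from_entrypoints(
    *       &cmd_dispatch, &driver_device_entrypoints, true);
    *    vk_device_dispatch_table_from_entrypoints(
    *       &cmd_dispatch, &vk_common_device_entrypoints, false);
    *    device->command_dispatch_table = &cmd_dispatch;
    *
    *    // 2. Enqueue entrypoints go first in the device-level table so
    *    //    secondary command buffers record instead of executing.
    *    vk_device_dispatch_table_from_entrypoints(
    *       &dispatch_table, &vk_enqueue_unless_primary_device_entrypoint_table, true);
    *    vk_device_dispatch_table_from_entrypoints(
    *       &dispatch_table, &driver_device_entrypoints, false);
    */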

   /** Command buffer vtable when using the common command pool */
   const struct vk_command_buffer_ops *command_buffer_ops;

   /** Shader vtable for VK_EXT_shader_object and common pipelines */
   const struct vk_device_shader_ops *shader_ops;

   /** Acceleration structure build vtable for common BVH building. */
   const struct vk_acceleration_structure_build_ops *as_build_ops;

   /**
    * Write data to a buffer from the command processor. This is simpler than
    * setting up a staging buffer and faster for small writes, but is not
    * meant for larger amounts of data. \p data is owned by the caller and the
    * driver is expected to write it out directly to the command stream as
    * part of an immediate write packet.
    */
   void (*write_buffer_cp)(VkCommandBuffer cmdbuf, VkDeviceAddress addr,
                           void *data, uint32_t size);

   /* Flush data written via write_buffer_cp. Users must use a normal pipeline
    * barrier in order to read this data, with the appropriate destination
    * access, but this replaces the source access mask.
    */
   void (*flush_buffer_write_cp)(VkCommandBuffer cmdbuf);
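
   /* A hedged usage sketch of the two hooks above, as a caller (e.g. common
    * meta code) might drive them; `marker_addr` and the value written are
    * illustrative:
    *
    *    uint32_t marker = 1;
    *    device->write_buffer_cp(cmdbuf, marker_addr, &marker, sizeof(marker));
    *    device->flush_buffer_write_cp(cmdbuf);
    *    // A pipeline barrier with the appropriate destination access is
    *    // still required before anything reads marker_addr.
    */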

   /* An unaligned dispatch function. This launches a number of threads that
    * may not be a multiple of the workgroup size, which may result in partial
    * workgroups.
    */
   void (*cmd_dispatch_unaligned)(VkCommandBuffer cmdbuf,
                                  uint32_t invocations_x,
                                  uint32_t invocations_y,
                                  uint32_t invocations_z);

   /* vkCmdFillBuffer but with a device address. */
   void (*cmd_fill_buffer_addr)(VkCommandBuffer cmdbuf,
                                VkDeviceAddress devAddr,
                                VkDeviceSize size,
                                uint32_t data);

   /** Driver provided callback for capturing traces
    *
    * Triggers for this callback are:
    *    - Keyboard input (F12)
    *    - Creation of a trigger file
    *    - Reaching the trace frame
    */
   VkResult (*capture_trace)(VkQueue queue);

   uint32_t current_frame;
   bool trace_hotkey_trigger;
   simple_mtx_t trace_mtx;

   /* For VK_EXT_private_data */
   uint32_t private_data_next_index;

   struct list_head queues;

   struct {
      int lost;
      bool reported;
   } _lost;

   /** Checks the status of this device
    *
    * This is expected to return either VK_SUCCESS or VK_ERROR_DEVICE_LOST.
    * It is called before ``vk_queue::driver_submit`` and after every non-trivial
    * wait operation to ensure the device is still around.  This gives the
    * driver a hook to ask the kernel if its device is still valid.  If the
    * kernel says the device has been lost, it MUST call vk_device_set_lost().
    *
    * This function may be called from any thread at any time.
    */
   VkResult (*check_status)(struct vk_device *device);
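
   /* A minimal, hypothetical implementation sketch; drv_device and
    * drv_kernel_device_is_wedged() are made-up placeholders for a driver's
    * own types and kernel query:
    *
    *    static VkResult
    *    drv_check_status(struct vk_device *vk_dev)
    *    {
    *       struct drv_device *dev = container_of(vk_dev, struct drv_device, vk);
    *
    *       if (drv_kernel_device_is_wedged(dev))
    *          return vk_device_set_lost(vk_dev, "kernel reports a wedged GPU");
    *       return VK_SUCCESS;
    *    }
    */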

   /* Get the device timestamp in the VK_TIME_DOMAIN_DEVICE_KHR domain */
   VkResult (*get_timestamp)(struct vk_device *device, uint64_t *timestamp);

   /** Host time domain used for timestamp calibration */
   VkTimeDomainKHR calibrate_time_domain;
   /** Period of VK_TIME_DOMAIN_DEVICE_KHR */
   uint64_t device_time_domain_period;

   /** Creates a vk_sync that wraps a memory object
    *
    * This is always a one-shot object so it need not track any additional
    * state.  Since it's intended for synchronizing between processes using
    * implicit synchronization mechanisms, no such tracking would be valid
    * anyway.
    *
    * If `signal_memory` is set, the resulting vk_sync will be used to signal
    * the memory object from a queue via ``vk_queue_submit::signals``.  The
    * common code guarantees that, by the time vkQueueSubmit() returns, the
    * signal operation has been submitted to the kernel via the driver's
    * ``vk_queue::driver_submit`` hook.  This means that any vkQueueSubmit()
    * call which needs implicit synchronization may block.
    *
    * If `signal_memory` is not set, it can be assumed that the memory object
    * already has a signal operation pending from some other process and we
    * need only wait on it.
    */
   VkResult (*create_sync_for_memory)(struct vk_device *device,
                                      VkDeviceMemory memory,
                                      bool signal_memory,
                                      struct vk_sync **sync_out);

   /* Set by vk_device_set_drm_fd() */
   int drm_fd;

   /** Implicit pipeline cache, or NULL */
   struct vk_pipeline_cache *mem_cache;

   /** An enum describing how timeline semaphores work */
   enum vk_device_timeline_mode {
      /** Timeline semaphores are not supported */
      VK_DEVICE_TIMELINE_MODE_NONE,

      /** Timeline semaphores are emulated with vk_timeline
       *
       * In this mode, timeline semaphores are emulated using vk_timeline
       * which is a collection of binary semaphores, one per time point.
       * These timeline semaphores cannot be shared because the data structure
       * exists entirely in userspace.  These timelines are virtually
       * invisible to the driver; all it sees are the binary vk_syncs, one per
       * time point.
       *
       * To handle wait-before-signal, we place all vk_queue_submits in the
       * queue's submit list in vkQueueSubmit() and call vk_device_flush() at
       * key points such as the end of vkQueueSubmit() and vkSemaphoreSignal().
       * This ensures that, as soon as a given submit's dependencies are fully
       * resolvable, it gets submitted to the driver.
       */
      VK_DEVICE_TIMELINE_MODE_EMULATED,

      /** Timeline semaphores are a kernel-assisted emulation
       *
       * In this mode, timeline semaphores are still technically an emulation
       * in the sense that they don't support wait-before-signal natively.
       * Instead, all GPU-waitable objects support a CPU wait-for-pending
       * operation which lets the userspace driver wait until a given event
       * on the (possibly shared) vk_sync is pending.  The event is "pending"
       * if a job has been submitted to the kernel (possibly from a different
       * process) which will signal it.  In vkQueueSubmit(), we use this wait
       * mode to detect waits which are not yet pending and, the first time we
       * do, spawn a thread to manage the queue.  That thread waits for each
       * submit's waits to all be pending before submitting to the driver
       * queue.
       *
       * We have to be a bit more careful about a few things in this mode.
       * In particular, we can never assume that any given wait operation is
       * pending.  For instance, when we go to export a sync file from a
       * binary semaphore, we need to first wait for it to be pending.  The
       * spec guarantees that the vast majority of these waits return almost
       * immediately, but we do need to insert them for correctness.
       */
      VK_DEVICE_TIMELINE_MODE_ASSISTED,

      /** Timeline semaphores are 100% native
       *
       * In this mode, wait-before-signal is natively supported by the
       * underlying timeline implementation.  We can submit-and-forget and
       * assume that dependencies will get resolved for us by the kernel.
       * Currently, this isn't supported by any Linux primitives.
       */
      VK_DEVICE_TIMELINE_MODE_NATIVE,
   } timeline_mode;

   /** Per-device submit mode
    *
    * This represents the device-wide submit strategy which may be different
    * from the per-queue submit mode.  See vk_queue.submit.mode for more
    * details.
    */
   enum vk_queue_submit_mode submit_mode;

   struct vk_memory_trace_data memory_trace_data;

   mtx_t swapchain_private_mtx;
   struct hash_table *swapchain_private;
   mtx_t swapchain_name_mtx;
   struct hash_table *swapchain_name;

   /* For VK_KHR_pipeline_binary */
   bool disable_internal_cache;
};

VK_DEFINE_HANDLE_CASTS(vk_device, base, VkDevice,
                       VK_OBJECT_TYPE_DEVICE);

/** Initialize a vk_device
 *
 * Along with initializing the data structures in `vk_device`, this function
 * checks that every extension specified by
 * ``VkDeviceCreateInfo::ppEnabledExtensionNames`` is actually supported by
 * the physical device and returns `VK_ERROR_EXTENSION_NOT_PRESENT` if an
 * unsupported extension is requested.  It also checks all the feature structs
 * chained into the `pCreateInfo->pNext` chain against the features returned
 * by `vkGetPhysicalDeviceFeatures2` and returns
 * `VK_ERROR_FEATURE_NOT_PRESENT` if an unsupported feature is requested.
 *
 * :param device:               |out| The device to initialize
 * :param physical_device:      |in|  The physical device
 * :param dispatch_table:       |in|  Device-level dispatch table
 * :param pCreateInfo:          |in|  VkDeviceCreateInfo pointer passed to
 *                                    `vkCreateDevice()`
 * :param alloc:                |in|  Allocation callbacks passed to
 *                                    `vkCreateDevice()`
 */
VkResult MUST_CHECK
vk_device_init(struct vk_device *device,
               struct vk_physical_device *physical_device,
               const struct vk_device_dispatch_table *dispatch_table,
               const VkDeviceCreateInfo *pCreateInfo,
               const VkAllocationCallbacks *alloc);
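
/* For illustration only: a hedged sketch of how a driver's vkCreateDevice
 * entrypoint might drive vk_device_init() once it has built its device-level
 * dispatch table.  `dev`, `pdev`, and `drm_fd` are hypothetical driver
 * objects:
 *
 *    VkResult result = vk_device_init(&dev->vk, &pdev->vk, &dispatch_table,
 *                                     pCreateInfo, pAllocator);
 *    if (result != VK_SUCCESS)
 *       return result;
 *
 *    vk_device_set_drm_fd(&dev->vk, drm_fd);
 */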

static inline void
vk_device_set_drm_fd(struct vk_device *device, int drm_fd)
{
   device->drm_fd = drm_fd;
}

/** Tears down a vk_device
 *
 * :param device:       |out| The device to tear down
 */
void
vk_device_finish(struct vk_device *device);

/** Enables threaded submit on this device
 *
 * This doesn't ensure that threaded submit will be used.  It just disables
 * the deferred submit option for emulated timeline semaphores and forces them
 * to always use the threaded path.  It also does some checks that the vk_sync
 * types used by the driver work for threaded submit.
 *
 * This must be called before any queues are created.
 */
void vk_device_enable_threaded_submit(struct vk_device *device);

static inline bool
vk_device_supports_threaded_submit(const struct vk_device *device)
{
   return device->submit_mode == VK_QUEUE_SUBMIT_MODE_THREADED ||
          device->submit_mode == VK_QUEUE_SUBMIT_MODE_THREADED_ON_DEMAND;
}

VkResult vk_device_flush(struct vk_device *device);

VkResult PRINTFLIKE(4, 5)
_vk_device_set_lost(struct vk_device *device,
                    const char *file, int line,
                    const char *msg, ...);

#define vk_device_set_lost(device, ...) \
   _vk_device_set_lost(device, __FILE__, __LINE__, __VA_ARGS__)

void _vk_device_report_lost(struct vk_device *device);

static inline bool
vk_device_is_lost_no_report(struct vk_device *device)
{
   return p_atomic_read(&device->_lost.lost) > 0;
}

static inline bool
vk_device_is_lost(struct vk_device *device)
{
   int lost = vk_device_is_lost_no_report(device);
   if (unlikely(lost && !device->_lost.reported))
      _vk_device_report_lost(device);
   return lost;
}

static inline VkResult
vk_device_check_status(struct vk_device *device)
{
   if (vk_device_is_lost(device))
      return VK_ERROR_DEVICE_LOST;

   if (!device->check_status)
      return VK_SUCCESS;

   VkResult result = device->check_status(device);

   assert(result == VK_SUCCESS || result == VK_ERROR_DEVICE_LOST);
   if (result == VK_ERROR_DEVICE_LOST)
      assert(vk_device_is_lost_no_report(device));

   return result;
}

VkResult
vk_device_get_timestamp(struct vk_device *device, VkTimeDomainKHR domain,
                        uint64_t *timestamp);

#ifndef _WIN32

uint64_t
vk_clock_gettime(clockid_t clock_id);

#endif //!_WIN32

static inline uint64_t
vk_time_max_deviation(uint64_t begin, uint64_t end, uint64_t max_clock_period)
{
    /*
     * The maximum deviation is the sum of the interval over which we
     * perform the sampling and the maximum period of any sampled
     * clock. That's because the maximum skew between any two sampled
     * clock edges is when the sampled clock with the largest period is
     * sampled at the end of that period but right at the beginning of the
     * sampling interval and some other clock is sampled right at the
     * beginning of its sampling period and right at the end of the
     * sampling interval. Let's assume the GPU has the longest clock
     * period and that the application is sampling GPU and monotonic:
     *
     *                               s                 e
     *                   w x y z 0 1 2 3 4 5 6 7 8 9 a b c d e f
     *  Raw              -_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-
     *
     *                               g
     *            0         1         2         3
     *  GPU       -----_____-----_____-----_____-----_____
     *
     *                                        m
     *                              x y z 0 1 2 3 4 5 6 7 8 9 a b c
     *  Monotonic                   -_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-
     *
     *  Interval                     <----------------->
     *  Deviation           <-------------------------->
     *
     *          s  = read(raw)       2
     *          g  = read(GPU)       1
     *          m  = read(monotonic) 2
     *          e  = read(raw)       b
     *
     * We round the sample interval up by one tick to cover sampling error
     * in the interval clock.
     */

   uint64_t sample_interval = end - begin + 1;

   return sample_interval + max_clock_period;
}
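
/* A hedged usage sketch: a vkGetCalibratedTimestampsKHR-style implementation
 * brackets its clock samples with a high-resolution host clock and reports
 * the worst-case skew via vk_time_max_deviation().  On non-Windows platforms
 * this might look like the following, where device_clock_period is an
 * illustrative name for the driver's tick period:
 *
 *    uint64_t begin = vk_clock_gettime(CLOCK_MONOTONIC_RAW);
 *    // ... sample the device clock and any requested host domains ...
 *    uint64_t end = vk_clock_gettime(CLOCK_MONOTONIC_RAW);
 *    *pMaxDeviation = vk_time_max_deviation(begin, end, device_clock_period);
 */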

PFN_vkVoidFunction
vk_device_get_proc_addr(const struct vk_device *device,
                        const char *name);
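
/* Hedged sketch: a driver's vkGetDeviceProcAddr entrypoint can be a thin
 * wrapper over this helper (drv_GetDeviceProcAddr is a hypothetical name):
 *
 *    VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL
 *    drv_GetDeviceProcAddr(VkDevice _device, const char *pName)
 *    {
 *       VK_FROM_HANDLE(vk_device, device, _device);
 *       return vk_device_get_proc_addr(device, pName);
 *    }
 */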

#ifdef __cplusplus
}
#endif

#endif /* VK_DEVICE_H */