/*
 * Copyright © 2023 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "xe/anv_batch_chain.h"

#include "anv_private.h"
#include "anv_measure.h"
#include "common/intel_bind_timeline.h"

#include "drm-uapi/xe_drm.h"

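/* Submit a single batch buffer on the queue's exec queue (or on its companion
 * RCS exec queue) and synchronously wait for completion using a temporary
 * syncobj. The submission also waits on the VM bind timeline so pending binds
 * are visible to the batch.
 */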
VkResult
xe_execute_simple_batch(struct anv_queue *queue,
                        struct anv_bo *batch_bo,
                        uint32_t batch_bo_size,
                        bool is_companion_rcs_batch)
{
   struct anv_device *device = queue->device;
   uint32_t exec_queue_id = is_companion_rcs_batch ?
                            queue->companion_rcs_id :
                            queue->exec_queue_id;
   struct drm_syncobj_create syncobj_create = {};
   struct drm_syncobj_destroy syncobj_destroy = {};
   struct drm_xe_sync syncs[2] = {};
   VkResult result = VK_SUCCESS;

   if (intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_CREATE, &syncobj_create))
      return vk_errorf(device, VK_ERROR_UNKNOWN, "Unable to create sync obj");

   syncs[0].type = DRM_XE_SYNC_TYPE_SYNCOBJ;
   syncs[0].flags = DRM_XE_SYNC_FLAG_SIGNAL;
   syncs[0].handle = syncobj_create.handle;

   /* vm bind sync */
   syncs[1].type = DRM_XE_SYNC_TYPE_TIMELINE_SYNCOBJ;
   syncs[1].handle = intel_bind_timeline_get_syncobj(&device->bind_timeline);
   syncs[1].timeline_value = intel_bind_timeline_get_last_point(&device->bind_timeline);

   struct drm_xe_exec exec = {
      .exec_queue_id = exec_queue_id,
      .num_batch_buffer = 1,
      .address = batch_bo->offset,
      .num_syncs = ARRAY_SIZE(syncs),
      .syncs = (uintptr_t)syncs,
   };

   if (intel_ioctl(device->fd, DRM_IOCTL_XE_EXEC, &exec)) {
      result = vk_device_set_lost(&device->vk, "XE_EXEC failed: %m");
      goto exec_error;
   }

   struct drm_syncobj_wait wait = {
      .handles = (uintptr_t)&syncobj_create.handle,
      .timeout_nsec = INT64_MAX,
      .count_handles = 1,
   };
   if (intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_WAIT, &wait))
      result = vk_device_set_lost(&device->vk, "DRM_IOCTL_SYNCOBJ_WAIT failed: %m");

exec_error:
   syncobj_destroy.handle = syncobj_create.handle;
   intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_DESTROY, &syncobj_destroy);

   return result;
}

#define TYPE_SIGNAL true
#define TYPE_WAIT false

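/* Translate a vk_sync into a struct drm_xe_sync entry. A non-zero value
 * selects a timeline syncobj, otherwise a binary syncobj is used; "signal"
 * decides whether the kernel signals or waits on it.
 */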
static void
xe_exec_fill_sync(struct drm_xe_sync *xe_sync, struct vk_sync *vk_sync,
                  uint64_t value, bool signal)
{
   if (unlikely(!vk_sync_type_is_drm_syncobj(vk_sync->type))) {
      unreachable("Unsupported sync type");
      return;
   }

   const struct vk_drm_syncobj *syncobj = vk_sync_as_drm_syncobj(vk_sync);
   xe_sync->handle = syncobj->syncobj;

   if (value) {
      xe_sync->type = DRM_XE_SYNC_TYPE_TIMELINE_SYNCOBJ;
      xe_sync->timeline_value = value;
   } else {
      xe_sync->type = DRM_XE_SYNC_TYPE_SYNCOBJ;
   }

   if (signal)
      xe_sync->flags = DRM_XE_SYNC_FLAG_SIGNAL;
}

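/* Build the drm_xe_sync array for a submission: the utrace signal (when the
 * utrace submit has no batch of its own), the caller's waits and signals, any
 * extra syncs, the queue sync and finally a wait on the VM bind timeline.
 * The caller owns the returned array and must vk_free() it.
 */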
static VkResult
xe_exec_process_syncs(struct anv_queue *queue,
                      uint32_t wait_count, const struct vk_sync_wait *waits,
                      uint32_t signal_count, const struct vk_sync_signal *signals,
                      uint32_t extra_sync_count, const struct drm_xe_sync *extra_syncs,
                      struct anv_utrace_submit *utrace_submit,
                      bool is_companion_rcs_queue,
                      struct drm_xe_sync **ret, uint32_t *ret_count)
{
   struct anv_device *device = queue->device;
   /* Signal the utrace sync only if it doesn't have a batch. Otherwise it's
    * the utrace batch that should signal its own sync.
    */
   const bool has_utrace_sync = utrace_submit &&
      util_dynarray_num_elements(&utrace_submit->batch_bos, struct anv_bo *) == 0;
   const uint32_t num_syncs = wait_count + signal_count + extra_sync_count +
                              (has_utrace_sync ? 1 : 0) +
                              ((queue->sync && !is_companion_rcs_queue) ? 1 : 0) +
                              1 /* vm bind sync */;
   struct drm_xe_sync *xe_syncs = vk_zalloc(&device->vk.alloc,
                                            sizeof(*xe_syncs) * num_syncs, 8,
                                            VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
   struct drm_xe_sync *xe_sync;

   if (!xe_syncs)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   uint32_t count = 0;

   if (has_utrace_sync) {
      xe_sync = &xe_syncs[count++];
      xe_exec_fill_sync(xe_sync, utrace_submit->sync, 0, TYPE_SIGNAL);
   }

   for (uint32_t i = 0; i < wait_count; i++) {
      const struct vk_sync_wait *vk_wait = &waits[i];

      xe_sync = &xe_syncs[count++];
      xe_exec_fill_sync(xe_sync, vk_wait->sync, vk_wait->wait_value,
                        TYPE_WAIT);
   }

   for (uint32_t i = 0; i < signal_count; i++) {
      const struct vk_sync_signal *vk_signal = &signals[i];

      xe_sync = &xe_syncs[count++];
      xe_exec_fill_sync(xe_sync, vk_signal->sync, vk_signal->signal_value,
                        TYPE_SIGNAL);
   }

   for (uint32_t i = 0; i < extra_sync_count; i++)
      xe_syncs[count++] = extra_syncs[i];

   if (queue->sync && !is_companion_rcs_queue) {
      xe_sync = &xe_syncs[count++];
      xe_exec_fill_sync(xe_sync, queue->sync, 0,
                        TYPE_SIGNAL);
   }

   /* vm bind sync */
   xe_sync = &xe_syncs[count++];
   xe_sync->handle = intel_bind_timeline_get_syncobj(&device->bind_timeline);
   xe_sync->type = DRM_XE_SYNC_TYPE_TIMELINE_SYNCOBJ;
   xe_sync->timeline_value = intel_bind_timeline_get_last_point(&device->bind_timeline);

   assert(count == num_syncs);
   *ret = xe_syncs;
   *ret_count = num_syncs;
   return VK_SUCCESS;
}

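/* Print submission details when the corresponding INTEL_DEBUG flags are set. */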
static void
xe_exec_print_debug(struct anv_queue *queue, uint32_t cmd_buffer_count,
                    struct anv_cmd_buffer **cmd_buffers,
                    struct anv_query_pool *perf_query_pool,
                    uint32_t perf_query_pass, struct drm_xe_exec *exec)
{
   if (INTEL_DEBUG(DEBUG_SUBMIT))
      fprintf(stderr, "Batch offset=0x%016"PRIx64" on queue %u\n",
              (uint64_t)exec->address, queue->vk.index_in_family);

   anv_cmd_buffer_exec_batch_debug(queue, cmd_buffer_count, cmd_buffers,
                                   perf_query_pool, perf_query_pass);
}

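/* Submit a TR-TT batch for a sparse binding operation. In addition to the
 * submission's regular waits and signals, the TR-TT timeline is signaled at
 * the batch's timeline point.
 */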
VkResult
xe_execute_trtt_batch(struct anv_sparse_submission *submit,
                      struct anv_trtt_batch_bo *trtt_bbo)
{
   struct anv_queue *queue = submit->queue;
   struct anv_device *device = queue->device;
   struct anv_trtt *trtt = &device->trtt;
   VkResult result;

   struct drm_xe_sync extra_sync = {
      .type = DRM_XE_SYNC_TYPE_TIMELINE_SYNCOBJ,
      .flags = DRM_XE_SYNC_FLAG_SIGNAL,
      .handle = trtt->timeline_handle,
      .timeline_value = trtt_bbo->timeline_val,
   };

   struct drm_xe_sync *xe_syncs = NULL;
   uint32_t xe_syncs_count = 0;
   result = xe_exec_process_syncs(queue, submit->wait_count, submit->waits,
                                  submit->signal_count, submit->signals,
                                  1, &extra_sync,
                                  NULL, /* utrace_submit */
                                  false, /* is_companion_rcs_queue */
                                  &xe_syncs, &xe_syncs_count);
   if (result != VK_SUCCESS)
      return result;

   struct drm_xe_exec exec = {
      .exec_queue_id = queue->exec_queue_id,
      .num_syncs = xe_syncs_count,
      .syncs = (uintptr_t)xe_syncs,
      .address = trtt_bbo->bo->offset,
      .num_batch_buffer = 1,
   };

   if (!device->info->no_hw) {
      if (intel_ioctl(device->fd, DRM_IOCTL_XE_EXEC, &exec))
         return vk_device_set_lost(&device->vk, "XE_EXEC failed: %m");
   }

   if (queue->sync) {
      result = vk_sync_wait(&device->vk, queue->sync, 0,
                            VK_SYNC_WAIT_COMPLETE, UINT64_MAX);
      if (result != VK_SUCCESS)
         return vk_queue_set_lost(&queue->vk, "trtt sync wait failed");
   }

   return VK_SUCCESS;
}

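/* Submit the utrace batch: its sync is signaled on completion and the
 * submission waits on the VM bind timeline. On integrated GPUs that need a
 * host cache flush, the batch BOs are flushed before submission.
 */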
VkResult
xe_queue_exec_utrace_locked(struct anv_queue *queue,
                            struct anv_utrace_submit *utrace_submit)
{
   struct anv_device *device = queue->device;
   struct drm_xe_sync xe_syncs[2] = {};

   xe_exec_fill_sync(&xe_syncs[0], utrace_submit->sync, 0, TYPE_SIGNAL);

   xe_syncs[1].type = DRM_XE_SYNC_TYPE_TIMELINE_SYNCOBJ;
   xe_syncs[1].handle = intel_bind_timeline_get_syncobj(&device->bind_timeline);
   xe_syncs[1].timeline_value = intel_bind_timeline_get_last_point(&device->bind_timeline);

#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
   if (device->physical->memory.need_flush &&
       anv_bo_needs_host_cache_flush(device->utrace_bo_pool.bo_alloc_flags)) {
      util_dynarray_foreach(&utrace_submit->batch_bos, struct anv_bo *, bo)
         intel_flush_range((*bo)->map, (*bo)->size);
   }
#endif

   struct anv_bo *batch_bo =
      *util_dynarray_element(&utrace_submit->batch_bos, struct anv_bo *, 0);
   struct drm_xe_exec exec = {
      .exec_queue_id = queue->exec_queue_id,
      .num_batch_buffer = 1,
      .syncs = (uintptr_t)xe_syncs,
      .num_syncs = ARRAY_SIZE(xe_syncs),
      .address = batch_bo->offset,
   };
   if (likely(!device->info->no_hw)) {
      if (intel_ioctl(device->fd, DRM_IOCTL_XE_EXEC, &exec))
         return vk_device_set_lost(&device->vk, "anv_xe_queue_exec_locked failed: %m");
   }

   return VK_SUCCESS;
}

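/* Submit the companion RCS command buffer on the queue's companion RCS exec
 * queue, waiting on the given syncs and signaling queue->companion_sync.
 */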
static VkResult
xe_companion_rcs_queue_exec_locked(struct anv_queue *queue,
                                   struct anv_cmd_buffer *companion_rcs_cmd_buffer,
                                   uint32_t wait_count,
                                   const struct vk_sync_wait *waits)
{
   struct anv_device *device = queue->device;
   VkResult result;

   struct vk_sync_signal companion_sync = {
      .sync = queue->companion_sync,
   };
   struct drm_xe_sync *xe_syncs = NULL;
   uint32_t xe_syncs_count = 0;
   result = xe_exec_process_syncs(queue,
                                  wait_count, waits,
                                  1, &companion_sync,
                                  0, NULL, /* extra_syncs */
                                  NULL /* utrace_submit */,
                                  true /* is_companion_rcs_queue */,
                                  &xe_syncs,
                                  &xe_syncs_count);
   if (result != VK_SUCCESS)
      return result;

   struct drm_xe_exec exec = {
      .exec_queue_id = queue->companion_rcs_id,
      .num_batch_buffer = 1,
      .syncs = (uintptr_t)xe_syncs,
      .num_syncs = xe_syncs_count,
   };

   struct anv_batch_bo *batch_bo =
      list_first_entry(&companion_rcs_cmd_buffer->batch_bos,
                       struct anv_batch_bo, link);
   exec.address = batch_bo->bo->offset;

   anv_measure_submit(companion_rcs_cmd_buffer);
   xe_exec_print_debug(queue, 1, &companion_rcs_cmd_buffer, NULL, 0, &exec);

   if (!device->info->no_hw) {
      if (intel_ioctl(device->fd, DRM_IOCTL_XE_EXEC, &exec))
         result = vk_device_set_lost(&device->vk, "anv_xe_queue_exec_locked failed: %m");
   }
   vk_free(&device->vk.alloc, xe_syncs);

   return result;
}

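/* Main submission path for a queue: chain the command buffers together, flush
 * CPU caches when needed on integrated GPUs, execute the first batch BO (or
 * the trivial batch when there are no command buffers), then submit the
 * companion RCS batch and the utrace batch if present.
 */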
VkResult
xe_queue_exec_locked(struct anv_queue *queue,
                     uint32_t wait_count,
                     const struct vk_sync_wait *waits,
                     uint32_t cmd_buffer_count,
                     struct anv_cmd_buffer **cmd_buffers,
                     uint32_t signal_count,
                     const struct vk_sync_signal *signals,
                     struct anv_query_pool *perf_query_pool,
                     uint32_t perf_query_pass,
                     struct anv_utrace_submit *utrace_submit)
{
   struct anv_device *device = queue->device;
   VkResult result;

   struct drm_xe_sync *xe_syncs = NULL;
   uint32_t xe_syncs_count = 0;
   result = xe_exec_process_syncs(queue, wait_count, waits,
                                  signal_count, signals,
                                  0, NULL, /* extra_syncs */
                                  utrace_submit,
                                  false, /* is_companion_rcs_queue */
                                  &xe_syncs, &xe_syncs_count);
   if (result != VK_SUCCESS)
      return result;

   /* If we have no batch for utrace, just forget about it now. */
   if (utrace_submit &&
       util_dynarray_num_elements(&utrace_submit->batch_bos,
                                  struct anv_bo *) == 0)
      utrace_submit = NULL;

   struct drm_xe_exec exec = {
      .exec_queue_id = queue->exec_queue_id,
      .num_batch_buffer = 1,
      .syncs = (uintptr_t)xe_syncs,
      .num_syncs = xe_syncs_count,
   };

   if (cmd_buffer_count) {
      if (unlikely(device->physical->measure_device.config)) {
         for (uint32_t i = 0; i < cmd_buffer_count; i++)
            anv_measure_submit(cmd_buffers[i]);
      }

      anv_cmd_buffer_chain_command_buffers(cmd_buffers, cmd_buffer_count);

#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
      if (device->physical->memory.need_flush &&
          anv_bo_needs_host_cache_flush(device->batch_bo_pool.bo_alloc_flags))
         anv_cmd_buffer_clflush(cmd_buffers, cmd_buffer_count);
#endif

      struct anv_cmd_buffer *first_cmd_buffer = cmd_buffers[0];
      struct anv_batch_bo *first_batch_bo =
         list_first_entry(&first_cmd_buffer->batch_bos,
                          struct anv_batch_bo, link);
      exec.address = first_batch_bo->bo->offset;
   } else {
      exec.address = device->trivial_batch_bo->offset;
   }

   xe_exec_print_debug(queue, cmd_buffer_count, cmd_buffers, perf_query_pool,
                       perf_query_pass, &exec);

   /* TODO: add perfetto stuff when Xe supports it */

   if (!device->info->no_hw) {
      if (intel_ioctl(device->fd, DRM_IOCTL_XE_EXEC, &exec))
         result = vk_device_set_lost(&device->vk, "anv_xe_queue_exec_locked failed: %m");
   }
   vk_free(&device->vk.alloc, xe_syncs);

   if (cmd_buffer_count != 0 && cmd_buffers[0]->companion_rcs_cmd_buffer) {
      /* not allowed to chain cmd_buffers with companion_rcs_cmd_buffer */
      assert(cmd_buffer_count == 1);
      result = xe_companion_rcs_queue_exec_locked(queue,
                                                  cmd_buffers[0]->companion_rcs_cmd_buffer,
                                                  wait_count, waits);
   }

   result = anv_queue_post_submit(queue, result);

   if (result == VK_SUCCESS && utrace_submit)
      result = xe_queue_exec_utrace_locked(queue, utrace_submit);

   return result;
}