//
// Copyright 2021 The ANGLE Project Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// CLCommandQueueVk.cpp: Implements the class methods for CLCommandQueueVk.

#include "common/PackedCLEnums_autogen.h"
#include "common/system_utils.h"

#include "libANGLE/renderer/vulkan/CLCommandQueueVk.h"
#include "libANGLE/renderer/vulkan/CLContextVk.h"
#include "libANGLE/renderer/vulkan/CLDeviceVk.h"
#include "libANGLE/renderer/vulkan/CLEventVk.h"
#include "libANGLE/renderer/vulkan/CLKernelVk.h"
#include "libANGLE/renderer/vulkan/CLMemoryVk.h"
#include "libANGLE/renderer/vulkan/CLProgramVk.h"
#include "libANGLE/renderer/vulkan/CLSamplerVk.h"
#include "libANGLE/renderer/vulkan/cl_types.h"
#include "libANGLE/renderer/vulkan/clspv_utils.h"
#include "libANGLE/renderer/vulkan/vk_cache_utils.h"
#include "libANGLE/renderer/vulkan/vk_cl_utils.h"
#include "libANGLE/renderer/vulkan/vk_helpers.h"
#include "libANGLE/renderer/vulkan/vk_renderer.h"
#include "libANGLE/renderer/vulkan/vk_wrapper.h"

#include "libANGLE/renderer/serial_utils.h"

#include "libANGLE/CLBuffer.h"
#include "libANGLE/CLCommandQueue.h"
#include "libANGLE/CLContext.h"
#include "libANGLE/CLEvent.h"
#include "libANGLE/CLImage.h"
#include "libANGLE/CLKernel.h"
#include "libANGLE/CLSampler.h"
#include "libANGLE/Error.h"
#include "libANGLE/cl_types.h"
#include "libANGLE/cl_utils.h"

#include "spirv/unified1/NonSemanticClspvReflection.h"
#include "vulkan/vulkan_core.h"

#include <chrono>

namespace rx
{

namespace
{
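// Bounds on how long DispatchWorkThread::notify() waits for a free slot in the ring of
// pending queue serials before reporting CL_OUT_OF_RESOURCES (e.g. on device loss).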
static constexpr size_t kTimeoutInMS = 10000;
static constexpr size_t kSleepInMS = 500;
static constexpr size_t kTimeoutCheckIterations = kTimeoutInMS / kSleepInMS;

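// Transitions every non-user event in |eventList| that was recorded against |queueSerial|
// to |state|, executing any callbacks registered on those events.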
angle::Result SetEventsWithQueueSerialToState(const cl::EventPtrs &eventList,
                                              const QueueSerial &queueSerial,
                                              cl::ExecutionStatus state)
{
    ASSERT(state < cl::ExecutionStatus::EnumCount);

    for (cl::EventPtr event : eventList)
    {
        CLEventVk *eventVk = &event->getImpl<CLEventVk>();
        if (!eventVk->isUserEvent() && eventVk->usedByCommandBuffer(queueSerial))
        {
            ANGLE_TRY(eventVk->setStatusAndExecuteCallback(cl::ToCLenum(state)));
        }
    }
    return angle::Result::Continue;
}

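// DispatchWorkThread owns a background thread that receives submitted queue serials and
// finishes the associated GPU work in submission order, off the application thread.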
DispatchWorkThread::DispatchWorkThread(CLCommandQueueVk *commandQueue)
    : mCommandQueue(commandQueue),
      mIsTerminating(false),
      mQueueSerials(kFixedQueueLimit),
      mQueueSerialIndex(kInvalidQueueSerialIndex)
{}

DispatchWorkThread::~DispatchWorkThread()
{
    ASSERT(mIsTerminating);
}

angle::Result DispatchWorkThread::init()
{
    mQueueSerialIndex = mCommandQueue->getQueueSerialIndex();
    ASSERT(mQueueSerialIndex != kInvalidQueueSerialIndex);

    mWorkerThread = std::thread(&DispatchWorkThread::finishLoop, this);

    return angle::Result::Continue;
}

void DispatchWorkThread::terminate()
{
    // Terminate the background thread
    {
        std::unique_lock<std::mutex> lock(mThreadMutex);
        mIsTerminating = true;
    }
    mHasWorkSubmitted.notify_all();
    if (mWorkerThread.joinable())
    {
        mWorkerThread.join();
    }
}

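// Called on the submitting thread to hand |queueSerial| to the worker. Duplicate serials
// are coalesced; if the ring of pending serials stays full past the timeout, the device is
// assumed lost and CL_OUT_OF_RESOURCES is returned.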
angle::Result DispatchWorkThread::notify(QueueSerial queueSerial)
{
    ASSERT(queueSerial.getIndex() == mQueueSerialIndex);

    // QueueSerials are always received in order; each is either the same as or greater than
    // the last one
    std::unique_lock<std::mutex> ul(mThreadMutex);
    if (!mQueueSerials.empty())
    {
        QueueSerial &lastSerial = mQueueSerials.back();
        ASSERT(queueSerial >= lastSerial);
        if (queueSerial == lastSerial)
        {
            return angle::Result::Continue;
        }
    }

    // If the queue stays full, the device may be lost; bail out after a timeout
    size_t numIterations = 0;
    while (mQueueSerials.full() && numIterations < kTimeoutCheckIterations)
    {
        mHasEmptySlot.wait_for(ul, std::chrono::milliseconds(kSleepInMS),
                               [this]() { return !mQueueSerials.full(); });
        numIterations++;
    }
    if (numIterations == kTimeoutCheckIterations)
    {
        ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES);
    }

    mQueueSerials.push(queueSerial);
    mHasWorkSubmitted.notify_one();

    return angle::Result::Continue;
}

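// Worker-thread entry point: blocks until serials are queued (or termination is requested),
// then finishes each serial in FIFO order, dropping the lock around the blocking finish.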
angle::Result DispatchWorkThread::finishLoop()
{
    angle::SetCurrentThreadName("ANGLE-CL-CQD");

    while (true)
    {
        std::unique_lock<std::mutex> ul(mThreadMutex);
        mHasWorkSubmitted.wait(ul, [this]() { return !mQueueSerials.empty() || mIsTerminating; });

        while (!mQueueSerials.empty())
        {
            QueueSerial queueSerial = mQueueSerials.front();
            mQueueSerials.pop();
            mHasEmptySlot.notify_one();
            ul.unlock();
            // Finish the work associated with the queue serial
            ANGLE_TRY(mCommandQueue->finishQueueSerial(queueSerial));
            ul.lock();
        }

        if (mIsTerminating)
        {
            break;
        }
    }
    return angle::Result::Continue;
}

}  // namespace

CLCommandQueueVk::CLCommandQueueVk(const cl::CommandQueue &commandQueue)
    : CLCommandQueueImpl(commandQueue),
      mContext(&commandQueue.getContext().getImpl<CLContextVk>()),
      mDevice(&commandQueue.getDevice().getImpl<CLDeviceVk>()),
      mPrintfBuffer(nullptr),
      mComputePassCommands(nullptr),
      mQueueSerialIndex(kInvalidQueueSerialIndex),
      mNeedPrintfHandling(false),
      mPrintfInfos(nullptr),
      mFinishHandler(this)
{}

angle::Result CLCommandQueueVk::init()
{
    vk::Renderer *renderer = mContext->getRenderer();
    ASSERT(renderer);

    ANGLE_CL_IMPL_TRY_ERROR(vk::OutsideRenderPassCommandBuffer::InitializeCommandPool(
                                mContext, &mCommandPool.outsideRenderPassPool,
                                renderer->getQueueFamilyIndex(), getProtectionType()),
                            CL_OUT_OF_RESOURCES);

    ANGLE_CL_IMPL_TRY_ERROR(mContext->getRenderer()->getOutsideRenderPassCommandBufferHelper(
                                mContext, &mCommandPool.outsideRenderPassPool,
                                &mOutsideRenderPassCommandsAllocator, &mComputePassCommands),
                            CL_OUT_OF_RESOURCES);

    // Generate initial QueueSerial for command buffer helper
    ANGLE_CL_IMPL_TRY_ERROR(mContext->getRenderer()->allocateQueueSerialIndex(&mQueueSerialIndex),
                            CL_OUT_OF_RESOURCES);
    // and set an initial queue serial for the compute pass commands
    mComputePassCommands->setQueueSerial(
        mQueueSerialIndex, mContext->getRenderer()->generateQueueSerial(mQueueSerialIndex));

    // Initialize serials to be valid but appear submitted and finished.
    mLastFlushedQueueSerial = QueueSerial(mQueueSerialIndex, Serial());
    mLastSubmittedQueueSerial = mLastFlushedQueueSerial;

    ANGLE_TRY(mFinishHandler.init());

    return angle::Result::Continue;
}

CLCommandQueueVk::~CLCommandQueueVk()
{
    mFinishHandler.terminate();

    ASSERT(mComputePassCommands->empty());
    ASSERT(!mNeedPrintfHandling);

    if (mPrintfBuffer)
    {
        // The lifetime of the printf buffer is scoped to the command queue; release and destroy.
        const bool wasLastUser = mPrintfBuffer->release();
        ASSERT(wasLastUser);
        delete mPrintfBuffer;
    }

    VkDevice vkDevice = mContext->getDevice();

    if (mQueueSerialIndex != kInvalidQueueSerialIndex)
    {
        mContext->getRenderer()->releaseQueueSerialIndex(mQueueSerialIndex);
        mQueueSerialIndex = kInvalidQueueSerialIndex;
    }

    // Recycle the current command buffers
    mContext->getRenderer()->recycleOutsideRenderPassCommandBufferHelper(&mComputePassCommands);
    mCommandPool.outsideRenderPassPool.destroy(vkDevice);
}

angle::Result CLCommandQueueVk::setProperty(cl::CommandQueueProperties properties, cl_bool enable)
{
    // NOTE: "clSetCommandQueueProperty" has been deprecated as of OpenCL 1.1
    // http://man.opencl.org/deprecated.html
    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::enqueueReadBuffer(const cl::Buffer &buffer,
                                                  bool blocking,
                                                  size_t offset,
                                                  size_t size,
                                                  void *ptr,
                                                  const cl::EventPtrs &waitEvents,
                                                  CLEventImpl::CreateFunc *eventCreateFunc)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);

    ANGLE_TRY(processWaitlist(waitEvents));
    CLBufferVk *bufferVk = &buffer.getImpl<CLBufferVk>();

    if (blocking)
    {
        ANGLE_TRY(finishInternal());
        ANGLE_TRY(bufferVk->copyTo(ptr, offset, size));

        ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Complete));
    }
    else
    {
        // Stage a transfer routine
        HostTransferConfig transferConfig;
        transferConfig.type = CL_COMMAND_READ_BUFFER;
        transferConfig.offset = offset;
        transferConfig.size = size;
        transferConfig.dstHostPtr = ptr;
        ANGLE_TRY(addToHostTransferList(bufferVk, transferConfig));

        ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued));
    }

    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::enqueueWriteBuffer(const cl::Buffer &buffer,
                                                   bool blocking,
                                                   size_t offset,
                                                   size_t size,
                                                   const void *ptr,
                                                   const cl::EventPtrs &waitEvents,
                                                   CLEventImpl::CreateFunc *eventCreateFunc)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);

    ANGLE_TRY(processWaitlist(waitEvents));

    auto bufferVk = &buffer.getImpl<CLBufferVk>();

    if (blocking)
    {
        ANGLE_TRY(finishInternal());
        ANGLE_TRY(bufferVk->copyFrom(ptr, offset, size));

        ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Complete));
    }
    else
    {
        // Stage a transfer routine
        HostTransferConfig config;
        config.type = CL_COMMAND_WRITE_BUFFER;
        config.offset = offset;
        config.size = size;
        config.srcHostPtr = ptr;
        ANGLE_TRY(addToHostTransferList(bufferVk, config));

        ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued));
    }

    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::enqueueReadBufferRect(const cl::Buffer &buffer,
                                                      bool blocking,
                                                      const cl::MemOffsets &bufferOrigin,
                                                      const cl::MemOffsets &hostOrigin,
                                                      const cl::Coordinate &region,
                                                      size_t bufferRowPitch,
                                                      size_t bufferSlicePitch,
                                                      size_t hostRowPitch,
                                                      size_t hostSlicePitch,
                                                      void *ptr,
                                                      const cl::EventPtrs &waitEvents,
                                                      CLEventImpl::CreateFunc *eventCreateFunc)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);

    ANGLE_TRY(processWaitlist(waitEvents));
    auto bufferVk = &buffer.getImpl<CLBufferVk>();

    cl::BufferRect bufferRect{cl::Offset{bufferOrigin.x, bufferOrigin.y, bufferOrigin.z},
                              cl::Extents{region.x, region.y, region.z}, bufferRowPitch,
                              bufferSlicePitch, 1};

    cl::BufferRect ptrRect{cl::Offset{hostOrigin.x, hostOrigin.y, hostOrigin.z},
                           cl::Extents{region.x, region.y, region.z}, hostRowPitch, hostSlicePitch,
                           1};

    if (blocking)
    {
        ANGLE_TRY(finishInternal());
        ANGLE_TRY(bufferVk->getRect(bufferRect, ptrRect, ptr));
    }
    else
    {
        // Stage a transfer routine
        HostTransferConfig config;
        config.type = CL_COMMAND_READ_BUFFER_RECT;
        config.srcRect = bufferRect;
        config.dstRect = ptrRect;
        config.dstHostPtr = ptr;
        config.size = bufferVk->getSize();
        ANGLE_TRY(addToHostTransferList(bufferVk, config));
    }

    ANGLE_TRY(createEvent(eventCreateFunc,
                          blocking ? cl::ExecutionStatus::Complete : cl::ExecutionStatus::Queued));
    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::enqueueWriteBufferRect(const cl::Buffer &buffer,
                                                       bool blocking,
                                                       const cl::MemOffsets &bufferOrigin,
                                                       const cl::MemOffsets &hostOrigin,
                                                       const cl::Coordinate &region,
                                                       size_t bufferRowPitch,
                                                       size_t bufferSlicePitch,
                                                       size_t hostRowPitch,
                                                       size_t hostSlicePitch,
                                                       const void *ptr,
                                                       const cl::EventPtrs &waitEvents,
                                                       CLEventImpl::CreateFunc *eventCreateFunc)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);

    ANGLE_TRY(processWaitlist(waitEvents));
    auto bufferVk = &buffer.getImpl<CLBufferVk>();

    cl::BufferRect bufferRect{cl::Offset{bufferOrigin.x, bufferOrigin.y, bufferOrigin.z},
                              cl::Extents{region.x, region.y, region.z}, bufferRowPitch,
                              bufferSlicePitch, 1};

    cl::BufferRect ptrRect{cl::Offset{hostOrigin.x, hostOrigin.y, hostOrigin.z},
                           cl::Extents{region.x, region.y, region.z}, hostRowPitch, hostSlicePitch,
                           1};

    if (blocking)
    {
        ANGLE_TRY(finishInternal());
        ANGLE_TRY(bufferVk->setRect(ptr, ptrRect, bufferRect));
    }
    else
    {
        // Stage a transfer routine
        HostTransferConfig config;
        config.type = CL_COMMAND_WRITE_BUFFER_RECT;
        config.srcRect = ptrRect;
        config.dstRect = bufferRect;
        config.srcHostPtr = ptr;
        config.size = bufferVk->getSize();
        ANGLE_TRY(addToHostTransferList(bufferVk, config));
    }

    ANGLE_TRY(createEvent(eventCreateFunc,
                          blocking ? cl::ExecutionStatus::Complete : cl::ExecutionStatus::Queued));
    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::enqueueCopyBuffer(const cl::Buffer &srcBuffer,
                                                  const cl::Buffer &dstBuffer,
                                                  size_t srcOffset,
                                                  size_t dstOffset,
                                                  size_t size,
                                                  const cl::EventPtrs &waitEvents,
                                                  CLEventImpl::CreateFunc *eventCreateFunc)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);

    ANGLE_TRY(processWaitlist(waitEvents));

    CLBufferVk *srcBufferVk = &srcBuffer.getImpl<CLBufferVk>();
    CLBufferVk *dstBufferVk = &dstBuffer.getImpl<CLBufferVk>();

    vk::CommandBufferAccess access;
    if (srcBufferVk->isSubBuffer() && dstBufferVk->isSubBuffer() &&
        (srcBufferVk->getParent() == dstBufferVk->getParent()))
    {
        // This is a self-copy
        access.onBufferSelfCopy(&srcBufferVk->getBuffer());
    }
    else
    {
        access.onBufferTransferRead(&srcBufferVk->getBuffer());
        access.onBufferTransferWrite(&dstBufferVk->getBuffer());
    }

    vk::OutsideRenderPassCommandBuffer *commandBuffer;
    ANGLE_TRY(getCommandBuffer(access, &commandBuffer));

    VkBufferCopy copyRegion = {srcOffset, dstOffset, size};
    // Update the offsets in the case of sub-buffers
    if (srcBufferVk->getOffset())
    {
        copyRegion.srcOffset += srcBufferVk->getOffset();
    }
    if (dstBufferVk->getOffset())
    {
        copyRegion.dstOffset += dstBufferVk->getOffset();
    }
    commandBuffer->copyBuffer(srcBufferVk->getBuffer().getBuffer(),
                              dstBufferVk->getBuffer().getBuffer(), 1, &copyRegion);

    ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued));

    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::enqueueCopyBufferRect(const cl::Buffer &srcBuffer,
                                                      const cl::Buffer &dstBuffer,
                                                      const cl::MemOffsets &srcOrigin,
                                                      const cl::MemOffsets &dstOrigin,
                                                      const cl::Coordinate &region,
                                                      size_t srcRowPitch,
                                                      size_t srcSlicePitch,
                                                      size_t dstRowPitch,
                                                      size_t dstSlicePitch,
                                                      const cl::EventPtrs &waitEvents,
                                                      CLEventImpl::CreateFunc *eventCreateFunc)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
    ANGLE_TRY(processWaitlist(waitEvents));
    ANGLE_TRY(finishInternal());

    cl::BufferRect srcRect{cl::Offset{srcOrigin.x, srcOrigin.y, srcOrigin.z},
                           cl::Extents{region.x, region.y, region.z}, srcRowPitch, srcSlicePitch,
                           1};

    cl::BufferRect dstRect{cl::Offset{dstOrigin.x, dstOrigin.y, dstOrigin.z},
                           cl::Extents{region.x, region.y, region.z}, dstRowPitch, dstSlicePitch,
                           1};

    auto srcBufferVk = &srcBuffer.getImpl<CLBufferVk>();
    auto dstBufferVk = &dstBuffer.getImpl<CLBufferVk>();
    uint8_t *mapPointer = nullptr;
    ANGLE_TRY(srcBufferVk->map(mapPointer));
    ASSERT(mapPointer);
    ANGLE_TRY(dstBufferVk->setRect(static_cast<const void *>(mapPointer), srcRect, dstRect));

    ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Complete));
    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::enqueueFillBuffer(const cl::Buffer &buffer,
                                                  const void *pattern,
                                                  size_t patternSize,
                                                  size_t offset,
                                                  size_t size,
                                                  const cl::EventPtrs &waitEvents,
                                                  CLEventImpl::CreateFunc *eventCreateFunc)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);

    ANGLE_TRY(processWaitlist(waitEvents));

    CLBufferVk *bufferVk = &buffer.getImpl<CLBufferVk>();

    // Stage a transfer routine
    HostTransferConfig config;
    config.type = CL_COMMAND_FILL_BUFFER;
    config.patternSize = patternSize;
    config.offset = offset;
    config.size = size;
    config.srcHostPtr = pattern;
    ANGLE_TRY(addToHostTransferList(bufferVk, config));

    ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued));

    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::enqueueMapBuffer(const cl::Buffer &buffer,
                                                 bool blocking,
                                                 cl::MapFlags mapFlags,
                                                 size_t offset,
                                                 size_t size,
                                                 const cl::EventPtrs &waitEvents,
                                                 CLEventImpl::CreateFunc *eventCreateFunc,
                                                 void *&mapPtr)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);

    ANGLE_TRY(processWaitlist(waitEvents));

    cl::ExecutionStatus eventComplete = cl::ExecutionStatus::Queued;
    if (blocking || !eventCreateFunc)
    {
        ANGLE_TRY(finishInternal());
        eventComplete = cl::ExecutionStatus::Complete;
    }

    CLBufferVk *bufferVk = &buffer.getImpl<CLBufferVk>();
    uint8_t *mapPointer = nullptr;
    if (buffer.getFlags().intersects(CL_MEM_USE_HOST_PTR))
    {
        ANGLE_TRY(finishInternal());
        mapPointer = static_cast<uint8_t *>(buffer.getHostPtr()) + offset;
        ANGLE_TRY(bufferVk->copyTo(mapPointer, offset, size));
        eventComplete = cl::ExecutionStatus::Complete;
    }
    else
    {
        ANGLE_TRY(bufferVk->map(mapPointer, offset));
    }
    mapPtr = static_cast<void *>(mapPointer);

    if (bufferVk->isCurrentlyInUse())
    {
        eventComplete = cl::ExecutionStatus::Queued;
    }
    ANGLE_TRY(createEvent(eventCreateFunc, eventComplete));

    return angle::Result::Continue;
}

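// Records a copy between |imageVk| and |buffer| into the compute pass command buffer.
// |direction| selects image-to-buffer or buffer-to-image; |bufferOffset| is the byte offset
// into the buffer side of the copy.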
angle::Result CLCommandQueueVk::copyImageToFromBuffer(CLImageVk &imageVk,
                                                      vk::BufferHelper &buffer,
                                                      const cl::MemOffsets &origin,
                                                      const cl::Coordinate &region,
                                                      size_t bufferOffset,
                                                      ImageBufferCopyDirection direction)
{
    vk::CommandBufferAccess access;
    vk::OutsideRenderPassCommandBuffer *commandBuffer;
    VkImageAspectFlags aspectFlags = imageVk.getImage().getAspectFlags();
    if (direction == ImageBufferCopyDirection::ToBuffer)
    {
        access.onImageTransferRead(aspectFlags, &imageVk.getImage());
        access.onBufferTransferWrite(&buffer);
    }
    else
    {
        access.onImageTransferWrite(gl::LevelIndex(0), 1, 0,
                                    static_cast<uint32_t>(imageVk.getArraySize()), aspectFlags,
                                    &imageVk.getImage());
        access.onBufferTransferRead(&buffer);
    }
    ANGLE_TRY(getCommandBuffer(access, &commandBuffer));

    VkBufferImageCopy copyRegion = {};
    copyRegion.bufferOffset = bufferOffset;
    copyRegion.bufferRowLength = 0;
    copyRegion.bufferImageHeight = 0;
    copyRegion.imageExtent = cl_vk::GetExtent(imageVk.getExtentForCopy(region));
    copyRegion.imageOffset = cl_vk::GetOffset(imageVk.getOffsetForCopy(origin));
    copyRegion.imageSubresource = imageVk.getSubresourceLayersForCopy(
        origin, region, imageVk.getType(), ImageCopyWith::Buffer);
    if (imageVk.isWritable())
    {
        // We need an execution barrier if the image can be written to by a kernel
        ANGLE_TRY(insertBarrier());
    }

    VkMemoryBarrier memBarrier = {};
    memBarrier.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER;
    memBarrier.srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT;
    memBarrier.dstAccessMask = VK_ACCESS_MEMORY_READ_BIT;
    if (direction == ImageBufferCopyDirection::ToBuffer)
    {
        commandBuffer->copyImageToBuffer(imageVk.getImage().getImage(),
                                         VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
                                         buffer.getBuffer().getHandle(), 1, &copyRegion);

        mComputePassCommands->getCommandBuffer().pipelineBarrier(
            VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, 0, 1, &memBarrier, 0,
            nullptr, 0, nullptr);
    }
    else
    {
        commandBuffer->copyBufferToImage(buffer.getBuffer().getHandle(),
                                         imageVk.getImage().getImage(),
                                         VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &copyRegion);

        mComputePassCommands->getCommandBuffer().pipelineBarrier(
            VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &memBarrier,
            0, nullptr, 0, nullptr);
    }

    return angle::Result::Continue;
}

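// Stages a host transfer for |srcBuffer|: a queue-owned shadow buffer is created, the
// device-side copy is recorded into the compute pass now, and any remaining host-side copy
// is resolved in syncHostBuffers() once the queue serial finishes.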
angle::Result CLCommandQueueVk::addToHostTransferList(CLBufferVk *srcBuffer,
                                                      HostTransferConfig transferConfig)
{
    // TODO(aannestrand): Flush here if we reach some max-transfer-buffer heuristic
    // http://anglebug.com/377545840

    cl::Memory *transferBufferHandle =
        cl::Buffer::Cast(this->mContext->getFrontendObject().createBuffer(
            nullptr, cl::MemFlags{CL_MEM_READ_WRITE}, srcBuffer->getSize(), nullptr));
    if (transferBufferHandle == nullptr)
    {
        ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES);
    }
    HostTransferEntry transferEntry{transferConfig, cl::MemoryPtr{transferBufferHandle}};
    mCommandsStateMap[mComputePassCommands->getQueueSerial()].hostTransferList.emplace_back(
        transferEntry);

    // Release initialization reference, lifetime controlled by RefPointer.
    transferBufferHandle->release();

    // We need an execution barrier if the buffer can be written to by a kernel
    if (!mComputePassCommands->getCommandBuffer().empty() && srcBuffer->isWritable())
    {
        // TODO(aannestrand): Look into combining these kernel execution barriers
        // http://anglebug.com/377545840
        VkMemoryBarrier memoryBarrier = {VK_STRUCTURE_TYPE_MEMORY_BARRIER, nullptr,
                                         VK_ACCESS_SHADER_WRITE_BIT,
                                         VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT};
        mComputePassCommands->getCommandBuffer().pipelineBarrier(
            VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1,
            &memoryBarrier, 0, nullptr, 0, nullptr);
    }

    // Enqueue blit/transfer cmd
    VkPipelineStageFlags srcStageMask = {};
    VkPipelineStageFlags dstStageMask = {};
    VkMemoryBarrier memBarrier = {};
    memBarrier.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER;
    CLBufferVk &transferBufferHandleVk = transferBufferHandle->getImpl<CLBufferVk>();
    switch (transferConfig.type)
    {
        case CL_COMMAND_WRITE_BUFFER:
        {
            VkBufferCopy copyRegion = {transferConfig.offset, transferConfig.offset,
                                       transferConfig.size};
            ANGLE_TRY(transferBufferHandleVk.copyFrom(transferConfig.srcHostPtr,
                                                      transferConfig.offset, transferConfig.size));
            copyRegion.srcOffset += transferBufferHandleVk.getOffset();
            copyRegion.dstOffset += srcBuffer->getOffset();
            mComputePassCommands->getCommandBuffer().copyBuffer(
                transferBufferHandleVk.getBuffer().getBuffer(), srcBuffer->getBuffer().getBuffer(),
                1, &copyRegion);

            srcStageMask = VK_PIPELINE_STAGE_TRANSFER_BIT;
            dstStageMask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
            memBarrier.srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT;
            memBarrier.dstAccessMask = VK_ACCESS_MEMORY_READ_BIT;
            break;
        }
        case CL_COMMAND_WRITE_BUFFER_RECT:
        {
            ANGLE_TRY(transferBufferHandleVk.setRect(
                transferConfig.srcHostPtr, transferConfig.srcRect, transferConfig.dstRect));
            for (VkBufferCopy &copyRegion :
                 transferBufferHandleVk.rectCopyRegions(transferConfig.dstRect))
            {
                copyRegion.srcOffset += transferBufferHandleVk.getOffset();
                copyRegion.dstOffset += srcBuffer->getOffset();
                mComputePassCommands->getCommandBuffer().copyBuffer(
                    transferBufferHandleVk.getBuffer().getBuffer(),
                    srcBuffer->getBuffer().getBuffer(), 1, &copyRegion);
            }

            // Config transfer barrier
            srcStageMask = VK_PIPELINE_STAGE_TRANSFER_BIT;
            dstStageMask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
            memBarrier.srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT;
            memBarrier.dstAccessMask = VK_ACCESS_MEMORY_READ_BIT;
            break;
        }
        case CL_COMMAND_READ_BUFFER:
        {
            VkBufferCopy copyRegion = {transferConfig.offset, transferConfig.offset,
                                       transferConfig.size};
            copyRegion.srcOffset += srcBuffer->getOffset();
            copyRegion.dstOffset += transferBufferHandleVk.getOffset();
            mComputePassCommands->getCommandBuffer().copyBuffer(
                srcBuffer->getBuffer().getBuffer(), transferBufferHandleVk.getBuffer().getBuffer(),
                1, &copyRegion);

            srcStageMask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
            dstStageMask = VK_PIPELINE_STAGE_HOST_BIT;
            memBarrier.srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT;
            memBarrier.dstAccessMask = VK_ACCESS_MEMORY_READ_BIT;
            break;
        }
        case CL_COMMAND_READ_BUFFER_RECT:
        {
            for (VkBufferCopy &copyRegion :
                 transferBufferHandleVk.rectCopyRegions(transferConfig.srcRect))
            {
                copyRegion.srcOffset += srcBuffer->getOffset();
                copyRegion.dstOffset += transferBufferHandleVk.getOffset();
                mComputePassCommands->getCommandBuffer().copyBuffer(
                    srcBuffer->getBuffer().getBuffer(),
                    transferBufferHandleVk.getBuffer().getBuffer(), 1, &copyRegion);
            }

            // Config transfer barrier
            srcStageMask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
            dstStageMask = VK_PIPELINE_STAGE_HOST_BIT;
            memBarrier.srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT;
            memBarrier.dstAccessMask = VK_ACCESS_MEMORY_READ_BIT;
            break;
        }
        case CL_COMMAND_FILL_BUFFER:
        {
            VkBufferCopy copyRegion = {transferConfig.offset, transferConfig.offset,
                                       transferConfig.size};
            ANGLE_TRY(transferBufferHandleVk.fillWithPattern(
                transferConfig.srcHostPtr, transferConfig.patternSize, transferConfig.offset,
                transferConfig.size));
            copyRegion.srcOffset += transferBufferHandleVk.getOffset();
            copyRegion.dstOffset += srcBuffer->getOffset();
            mComputePassCommands->getCommandBuffer().copyBuffer(
                transferBufferHandleVk.getBuffer().getBuffer(), srcBuffer->getBuffer().getBuffer(),
                1, &copyRegion);

            // Config transfer barrier
            srcStageMask = VK_PIPELINE_STAGE_TRANSFER_BIT;
            dstStageMask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
            memBarrier.srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT;
            memBarrier.dstAccessMask = VK_ACCESS_MEMORY_READ_BIT;
            break;
        }
        default:
            UNIMPLEMENTED();
            break;
    }

    // TODO(aannestrand): Look into combining these transfer barriers
    // http://anglebug.com/377545840
    mComputePassCommands->getCommandBuffer().pipelineBarrier(srcStageMask, dstStageMask, 0, 1,
                                                             &memBarrier, 0, nullptr, 0, nullptr);

    return angle::Result::Continue;
}

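// Image variant: stages an image read by recording a copy of the image into a queue-owned
// shadow buffer; the result is copied out to the user pointer in syncHostBuffers().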
angle::Result CLCommandQueueVk::addToHostTransferList(CLImageVk *srcImage,
                                                      HostTransferConfig transferConfig)
{
    // TODO(aannestrand): Flush here if we reach some max-transfer-buffer heuristic
    // http://anglebug.com/377545840
    CommandsState &commandsState = mCommandsStateMap[mComputePassCommands->getQueueSerial()];

    cl::Memory *transferBufferHandle =
        cl::Buffer::Cast(this->mContext->getFrontendObject().createBuffer(
            nullptr, cl::MemFlags{CL_MEM_READ_WRITE}, srcImage->getSize(), nullptr));
    if (transferBufferHandle == nullptr)
    {
        ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES);
    }

    HostTransferEntry transferEntry{transferConfig, cl::MemoryPtr{transferBufferHandle}};
    commandsState.hostTransferList.emplace_back(transferEntry);

    // Release initialization reference, lifetime controlled by RefPointer.
    transferBufferHandle->release();

    // Enqueue blit
    CLBufferVk &transferBufferHandleVk = transferBufferHandle->getImpl<CLBufferVk>();
    ANGLE_TRY(copyImageToFromBuffer(*srcImage, transferBufferHandleVk.getBuffer(),
                                    transferConfig.origin, transferConfig.region, 0,
                                    ImageBufferCopyDirection::ToBuffer));

    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::enqueueReadImage(const cl::Image &image,
                                                 bool blocking,
                                                 const cl::MemOffsets &origin,
                                                 const cl::Coordinate &region,
                                                 size_t rowPitch,
                                                 size_t slicePitch,
                                                 void *ptr,
                                                 const cl::EventPtrs &waitEvents,
                                                 CLEventImpl::CreateFunc *eventCreateFunc)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
    CLImageVk &imageVk = image.getImpl<CLImageVk>();
    size_t size = (region.x * region.y * region.z * imageVk.getElementSize());

    ANGLE_TRY(processWaitlist(waitEvents));

    if (!imageVk.isStagingBufferInitialized())
    {
        ANGLE_TRY(imageVk.createStagingBuffer(imageVk.getSize()));
    }

    if (blocking)
    {
        ANGLE_TRY(copyImageToFromBuffer(imageVk, imageVk.getStagingBuffer(), origin, region, 0,
                                        ImageBufferCopyDirection::ToBuffer));
        ANGLE_TRY(finishInternal());
        if (rowPitch == 0 && slicePitch == 0)
        {
            ANGLE_TRY(imageVk.copyStagingTo(ptr, 0, size));
        }
        else
        {
            ANGLE_TRY(imageVk.copyStagingToFromWithPitch(ptr, region, rowPitch, slicePitch,
                                                         StagingBufferCopyDirection::ToHost));
        }
        ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Complete));
    }
    else
    {
        // Create a transfer buffer and push it in update list
        HostTransferConfig transferConfig;
        transferConfig.type = CL_COMMAND_READ_IMAGE;
        transferConfig.size = size;
        transferConfig.dstHostPtr = ptr;
        transferConfig.origin = origin;
        transferConfig.region = region;
        transferConfig.rowPitch = rowPitch;
        transferConfig.slicePitch = slicePitch;
        transferConfig.elementSize = imageVk.getElementSize();
        ANGLE_TRY(addToHostTransferList(&imageVk, transferConfig));

        ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued));
    }

    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::enqueueWriteImage(const cl::Image &image,
                                                  bool blocking,
                                                  const cl::MemOffsets &origin,
                                                  const cl::Coordinate &region,
                                                  size_t inputRowPitch,
                                                  size_t inputSlicePitch,
                                                  const void *ptr,
                                                  const cl::EventPtrs &waitEvents,
                                                  CLEventImpl::CreateFunc *eventCreateFunc)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
    ANGLE_TRY(processWaitlist(waitEvents));

    CLImageVk &imageVk = image.getImpl<CLImageVk>();
    size_t size = (region.x * region.y * region.z * imageVk.getElementSize());
    cl::ExecutionStatus eventInitialState = cl::ExecutionStatus::Queued;
    if (!imageVk.isStagingBufferInitialized())
    {
        ANGLE_TRY(imageVk.createStagingBuffer(imageVk.getSize()));
    }

    if (inputRowPitch == 0 && inputSlicePitch == 0)
    {
        ANGLE_TRY(imageVk.copyStagingFrom(const_cast<void *>(ptr), 0, size));
    }
    else
    {
        ANGLE_TRY(imageVk.copyStagingToFromWithPitch(const_cast<void *>(ptr), region,
                                                     inputRowPitch, inputSlicePitch,
                                                     StagingBufferCopyDirection::ToStagingBuffer));
    }

    ANGLE_TRY(copyImageToFromBuffer(imageVk, imageVk.getStagingBuffer(), origin, region, 0,
                                    ImageBufferCopyDirection::ToImage));

    if (blocking)
    {
        ANGLE_TRY(finishInternal());
        eventInitialState = cl::ExecutionStatus::Complete;
    }

    ANGLE_TRY(createEvent(eventCreateFunc, eventInitialState));

    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::enqueueCopyImage(const cl::Image &srcImage,
                                                 const cl::Image &dstImage,
                                                 const cl::MemOffsets &srcOrigin,
                                                 const cl::MemOffsets &dstOrigin,
                                                 const cl::Coordinate &region,
                                                 const cl::EventPtrs &waitEvents,
                                                 CLEventImpl::CreateFunc *eventCreateFunc)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
    ANGLE_TRY(processWaitlist(waitEvents));

    auto srcImageVk = &srcImage.getImpl<CLImageVk>();
    auto dstImageVk = &dstImage.getImpl<CLImageVk>();

    vk::CommandBufferAccess access;
    vk::OutsideRenderPassCommandBuffer *commandBuffer;
    VkImageAspectFlags dstAspectFlags = srcImageVk->getImage().getAspectFlags();
    VkImageAspectFlags srcAspectFlags = dstImageVk->getImage().getAspectFlags();
    access.onImageTransferWrite(gl::LevelIndex(0), 1, 0, 1, dstAspectFlags,
                                &dstImageVk->getImage());
    access.onImageTransferRead(srcAspectFlags, &srcImageVk->getImage());
    ANGLE_TRY(getCommandBuffer(access, &commandBuffer));

    VkImageCopy copyRegion = {};
    copyRegion.extent = cl_vk::GetExtent(srcImageVk->getExtentForCopy(region));
    copyRegion.srcOffset = cl_vk::GetOffset(srcImageVk->getOffsetForCopy(srcOrigin));
    copyRegion.dstOffset = cl_vk::GetOffset(dstImageVk->getOffsetForCopy(dstOrigin));
    copyRegion.srcSubresource = srcImageVk->getSubresourceLayersForCopy(
        srcOrigin, region, dstImageVk->getType(), ImageCopyWith::Image);
    copyRegion.dstSubresource = dstImageVk->getSubresourceLayersForCopy(
        dstOrigin, region, srcImageVk->getType(), ImageCopyWith::Image);
    if (srcImageVk->isWritable() || dstImageVk->isWritable())
    {
        // We need an execution barrier if either image can be written to by a kernel
        ANGLE_TRY(insertBarrier());
    }

    commandBuffer->copyImage(
        srcImageVk->getImage().getImage(), VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
        dstImageVk->getImage().getImage(), VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &copyRegion);

    ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued));

    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::enqueueFillImage(const cl::Image &image,
                                                 const void *fillColor,
                                                 const cl::MemOffsets &origin,
                                                 const cl::Coordinate &region,
                                                 const cl::EventPtrs &waitEvents,
                                                 CLEventImpl::CreateFunc *eventCreateFunc)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);

    ANGLE_TRY(processWaitlist(waitEvents));

    CLImageVk &imageVk = image.getImpl<CLImageVk>();
    PixelColor packedColor;
    cl::Extents extent = imageVk.getImageExtent();

    imageVk.packPixels(fillColor, &packedColor);

    if (!imageVk.isStagingBufferInitialized())
    {
        ANGLE_TRY(imageVk.createStagingBuffer(imageVk.getSize()));
    }

    ANGLE_TRY(copyImageToFromBuffer(imageVk, imageVk.getStagingBuffer(), cl::kMemOffsetsZero,
                                    {extent.width, extent.height, extent.depth}, 0,
                                    ImageBufferCopyDirection::ToBuffer));
    ANGLE_TRY(finishInternal());

    uint8_t *mapPointer = nullptr;
    ANGLE_TRY(imageVk.map(mapPointer, 0));
    imageVk.fillImageWithColor(origin, region, mapPointer, &packedColor);
    imageVk.unmap();
    mapPointer = nullptr;
    ANGLE_TRY(copyImageToFromBuffer(imageVk, imageVk.getStagingBuffer(), cl::kMemOffsetsZero,
                                    {extent.width, extent.height, extent.depth}, 0,
                                    ImageBufferCopyDirection::ToImage));

    ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued));

    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::enqueueCopyImageToBuffer(const cl::Image &srcImage,
                                                         const cl::Buffer &dstBuffer,
                                                         const cl::MemOffsets &srcOrigin,
                                                         const cl::Coordinate &region,
                                                         size_t dstOffset,
                                                         const cl::EventPtrs &waitEvents,
                                                         CLEventImpl::CreateFunc *eventCreateFunc)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
    CLImageVk &srcImageVk = srcImage.getImpl<CLImageVk>();
    CLBufferVk &dstBufferVk = dstBuffer.getImpl<CLBufferVk>();

    ANGLE_TRY(processWaitlist(waitEvents));

    ANGLE_TRY(copyImageToFromBuffer(srcImageVk, dstBufferVk.getBuffer(), srcOrigin, region,
                                    dstOffset, ImageBufferCopyDirection::ToBuffer));

    ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued));

    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::enqueueCopyBufferToImage(const cl::Buffer &srcBuffer,
                                                         const cl::Image &dstImage,
                                                         size_t srcOffset,
                                                         const cl::MemOffsets &dstOrigin,
                                                         const cl::Coordinate &region,
                                                         const cl::EventPtrs &waitEvents,
                                                         CLEventImpl::CreateFunc *eventCreateFunc)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
    CLBufferVk &srcBufferVk = srcBuffer.getImpl<CLBufferVk>();
    CLImageVk &dstImageVk = dstImage.getImpl<CLImageVk>();

    ANGLE_TRY(processWaitlist(waitEvents));

    ANGLE_TRY(copyImageToFromBuffer(dstImageVk, srcBufferVk.getBuffer(), dstOrigin, region,
                                    srcOffset, ImageBufferCopyDirection::ToImage));

    ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued));

    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::enqueueMapImage(const cl::Image &image,
                                                bool blocking,
                                                cl::MapFlags mapFlags,
                                                const cl::MemOffsets &origin,
                                                const cl::Coordinate &region,
                                                size_t *imageRowPitch,
                                                size_t *imageSlicePitch,
                                                const cl::EventPtrs &waitEvents,
                                                CLEventImpl::CreateFunc *eventCreateFunc,
                                                void *&mapPtr)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);

    ANGLE_TRY(processWaitlist(waitEvents));

    // TODO: Look into better enqueue handling of this map-op if non-blocking
    // https://anglebug.com/376722715
    CLImageVk *imageVk = &image.getImpl<CLImageVk>();
    cl::Extents extent = imageVk->getImageExtent();
    if (blocking)
    {
        ANGLE_TRY(finishInternal());
    }

    mComputePassCommands->imageRead(mContext, imageVk->getImage().getAspectFlags(),
                                    vk::ImageLayout::TransferSrc, &imageVk->getImage());

    if (!imageVk->isStagingBufferInitialized())
    {
        ANGLE_TRY(imageVk->createStagingBuffer(imageVk->getSize()));
    }

    ANGLE_TRY(copyImageToFromBuffer(*imageVk, imageVk->getStagingBuffer(), cl::kMemOffsetsZero,
                                    {extent.width, extent.height, extent.depth}, 0,
                                    ImageBufferCopyDirection::ToBuffer));
    if (blocking)
    {
        ANGLE_TRY(finishInternal());
    }

    uint8_t *mapPointer = nullptr;
    size_t elementSize = imageVk->getElementSize();
    size_t rowPitch = (extent.width * elementSize);
    size_t offset =
        (origin.x * elementSize) + (origin.y * rowPitch) + (origin.z * extent.height * rowPitch);
    size_t size = (region.x * region.y * region.z * elementSize);

    if (image.getFlags().intersects(CL_MEM_USE_HOST_PTR))
    {
        mapPointer = static_cast<uint8_t *>(image.getHostPtr()) + offset;
        ANGLE_TRY(imageVk->copyTo(mapPointer, offset, size));
    }
    else
    {
        ANGLE_TRY(imageVk->map(mapPointer, offset));
    }
    mapPtr = static_cast<void *>(mapPointer);

    *imageRowPitch = rowPitch;

    switch (imageVk->getDescriptor().type)
    {
        case cl::MemObjectType::Image1D:
        case cl::MemObjectType::Image1D_Buffer:
        case cl::MemObjectType::Image2D:
            if (imageSlicePitch != nullptr)
            {
                *imageSlicePitch = 0;
            }
            break;
        case cl::MemObjectType::Image2D_Array:
        case cl::MemObjectType::Image3D:
            *imageSlicePitch = (extent.height * (*imageRowPitch));
            break;
        case cl::MemObjectType::Image1D_Array:
            *imageSlicePitch = *imageRowPitch;
            break;
        default:
            UNREACHABLE();
            break;
    }

    ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Complete));

    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::enqueueUnmapMemObject(const cl::Memory &memory,
                                                      void *mappedPtr,
                                                      const cl::EventPtrs &waitEvents,
                                                      CLEventImpl::CreateFunc *eventCreateFunc)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);

    ANGLE_TRY(processWaitlist(waitEvents));

    cl::ExecutionStatus eventComplete = cl::ExecutionStatus::Queued;
    if (!eventCreateFunc)
    {
        ANGLE_TRY(finishInternal());
        eventComplete = cl::ExecutionStatus::Complete;
    }

    if (memory.getType() == cl::MemObjectType::Buffer)
    {
        CLBufferVk &bufferVk = memory.getImpl<CLBufferVk>();
        if (memory.getFlags().intersects(CL_MEM_USE_HOST_PTR))
        {
            ANGLE_TRY(finishInternal());
            ANGLE_TRY(bufferVk.copyFrom(memory.getHostPtr(), 0, bufferVk.getSize()));
            eventComplete = cl::ExecutionStatus::Complete;
        }
    }
    else if (memory.getType() != cl::MemObjectType::Pipe)
    {
        // Otherwise, this is an image type
        CLImageVk &imageVk = memory.getImpl<CLImageVk>();
        if (memory.getFlags().intersects(CL_MEM_USE_HOST_PTR))
        {
            uint8_t *mapPointer = static_cast<uint8_t *>(memory.getHostPtr());
            ANGLE_TRY(imageVk.copyStagingFrom(mapPointer, 0, imageVk.getSize()));
        }
        cl::Extents extent = imageVk.getImageExtent();
        ANGLE_TRY(copyImageToFromBuffer(imageVk, imageVk.getStagingBuffer(), cl::kMemOffsetsZero,
                                        {extent.width, extent.height, extent.depth}, 0,
                                        ImageBufferCopyDirection::ToImage));
        ANGLE_TRY(finishInternal());
        eventComplete = cl::ExecutionStatus::Complete;
    }
    else
    {
        // The Pipe mem object type is not supported; creation of such an object should have
        // failed
        UNREACHABLE();
    }

    memory.getImpl<CLMemoryVk>().unmap();
    ANGLE_TRY(createEvent(eventCreateFunc, eventComplete));

    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::enqueueMigrateMemObjects(const cl::MemoryPtrs &memObjects,
                                                         cl::MemMigrationFlags flags,
                                                         const cl::EventPtrs &waitEvents,
                                                         CLEventImpl::CreateFunc *eventCreateFunc)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);

    ANGLE_TRY(processWaitlist(waitEvents));

    if (mCommandQueue.getContext().getDevices().size() > 1)
    {
        // TODO(aannestrand): Later implement support to allow migration of mem objects across
        // different devices. http://anglebug.com/377942759
        UNIMPLEMENTED();
        ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES);
    }

    ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Complete));

    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::enqueueNDRangeKernel(const cl::Kernel &kernel,
                                                     const cl::NDRange &ndrange,
                                                     const cl::EventPtrs &waitEvents,
                                                     CLEventImpl::CreateFunc *eventCreateFunc)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);

    ANGLE_TRY(processWaitlist(waitEvents));

    vk::PipelineCacheAccess pipelineCache;
    vk::PipelineHelper *pipelineHelper = nullptr;
    CLKernelVk &kernelImpl = kernel.getImpl<CLKernelVk>();
    const CLProgramVk::DeviceProgramData *devProgramData =
        kernelImpl.getProgram()->getDeviceProgramData(mCommandQueue.getDevice().getNative());
    ASSERT(devProgramData != nullptr);
    cl::NDRange enqueueNDRange(ndrange);

    // Start with the workgroup size (WGS) from the kernel attribute (if available)
    cl::WorkgroupSize workgroupSize =
        devProgramData->getCompiledWorkgroupSize(kernelImpl.getKernelName());
    if (workgroupSize != cl::WorkgroupSize{0, 0, 0})
    {
        // The compiled local work size (LWS) was valid; use that as the WGS
        enqueueNDRange.localWorkSize = workgroupSize;
    }
    else
    {
        if (enqueueNDRange.nullLocalWorkSize)
        {
            // A NULL value was passed, in which case the OpenCL implementation determines
            // how to break the global work-items into appropriate work-group instances.
            enqueueNDRange.localWorkSize =
                mCommandQueue.getDevice().getImpl<CLDeviceVk>().selectWorkGroupSize(enqueueNDRange);
        }
        // At this point, we should have a non-zero workgroup size
        ASSERT((enqueueNDRange.localWorkSize != cl::WorkgroupSize{0, 0, 0}));
    }

    // Printf storage is set up for single-time usage, so drive any existing usage to completion
    // if the kernel uses printf.
    if (kernelImpl.usesPrintf() && mNeedPrintfHandling)
    {
        ANGLE_TRY(finishInternal());
    }

    // Fetch or create the compute pipeline (if we miss in the cache)
    ANGLE_CL_IMPL_TRY_ERROR(mContext->getRenderer()->getPipelineCache(mContext, &pipelineCache),
                            CL_OUT_OF_RESOURCES);

    ANGLE_TRY(processKernelResources(kernelImpl));
    ANGLE_TRY(processGlobalPushConstants(kernelImpl, enqueueNDRange));

    // Create uniform dispatch region(s) based on VkLimits for WorkgroupCount
    const uint32_t *maxComputeWorkGroupCount =
        mContext->getRenderer()->getPhysicalDeviceProperties().limits.maxComputeWorkGroupCount;
    for (cl::NDRange &uniformRegion : enqueueNDRange.createUniformRegions(
             {maxComputeWorkGroupCount[0], maxComputeWorkGroupCount[1],
              maxComputeWorkGroupCount[2]}))
    {
        cl::WorkgroupCount uniformRegionWorkgroupCount = uniformRegion.getWorkgroupCount();
        const VkPushConstantRange *pushConstantRegionOffset =
            devProgramData->getRegionOffsetRange();
        if (pushConstantRegionOffset != nullptr)
        {
            // The sum of the global ID offset into the NDRange for this uniform region and
            // the global offset of the NDRange
            // https://github.com/google/clspv/blob/main/docs/OpenCLCOnVulkan.md#module-scope-push-constants
            uint32_t regionOffsets[3] = {
                enqueueNDRange.globalWorkOffset[0] + uniformRegion.globalWorkOffset[0],
                enqueueNDRange.globalWorkOffset[1] + uniformRegion.globalWorkOffset[1],
                enqueueNDRange.globalWorkOffset[2] + uniformRegion.globalWorkOffset[2]};
            mComputePassCommands->getCommandBuffer().pushConstants(
                kernelImpl.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT,
                pushConstantRegionOffset->offset, pushConstantRegionOffset->size, &regionOffsets);
        }
        const VkPushConstantRange *pushConstantRegionGroupOffset =
            devProgramData->getRegionGroupOffsetRange();
        if (pushConstantRegionGroupOffset != nullptr)
        {
            // The 3D group ID offset into the NDRange for this region
            // https://github.com/google/clspv/blob/main/docs/OpenCLCOnVulkan.md#module-scope-push-constants
            ASSERT(enqueueNDRange.localWorkSize[0] > 0 && enqueueNDRange.localWorkSize[1] > 0 &&
                   enqueueNDRange.localWorkSize[2] > 0);
            ASSERT(uniformRegion.globalWorkOffset[0] % enqueueNDRange.localWorkSize[0] == 0 &&
                   uniformRegion.globalWorkOffset[1] % enqueueNDRange.localWorkSize[1] == 0 &&
                   uniformRegion.globalWorkOffset[2] % enqueueNDRange.localWorkSize[2] == 0);
            uint32_t regionGroupOffsets[3] = {
                uniformRegion.globalWorkOffset[0] / enqueueNDRange.localWorkSize[0],
                uniformRegion.globalWorkOffset[1] / enqueueNDRange.localWorkSize[1],
                uniformRegion.globalWorkOffset[2] / enqueueNDRange.localWorkSize[2]};
            mComputePassCommands->getCommandBuffer().pushConstants(
                kernelImpl.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT,
                pushConstantRegionGroupOffset->offset, pushConstantRegionGroupOffset->size,
                &regionGroupOffsets);
        }

        ANGLE_TRY(kernelImpl.getOrCreateComputePipeline(
            &pipelineCache, uniformRegion, mCommandQueue.getDevice(), &pipelineHelper));
        mComputePassCommands->retainResource(pipelineHelper);
        mComputePassCommands->getCommandBuffer().bindComputePipeline(pipelineHelper->getPipeline());
        mComputePassCommands->getCommandBuffer().dispatch(uniformRegionWorkgroupCount[0],
                                                          uniformRegionWorkgroupCount[1],
                                                          uniformRegionWorkgroupCount[2]);
    }

    ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued));

    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::enqueueTask(const cl::Kernel &kernel,
                                            const cl::EventPtrs &waitEvents,
                                            CLEventImpl::CreateFunc *eventCreateFunc)
{
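    // Per the OpenCL spec, clEnqueueTask is equivalent to clEnqueueNDRangeKernel with
    // work_dim = 1, global_work_size[0] = 1, and local_work_size[0] = 1.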
    constexpr size_t globalWorkSize[3] = {1, 0, 0};
    constexpr size_t localWorkSize[3] = {1, 0, 0};
    cl::NDRange ndrange(1, nullptr, globalWorkSize, localWorkSize);
    return enqueueNDRangeKernel(kernel, ndrange, waitEvents, eventCreateFunc);
}

angle::Result CLCommandQueueVk::enqueueNativeKernel(cl::UserFunc userFunc,
                                                    void *args,
                                                    size_t cbArgs,
                                                    const cl::BufferPtrs &buffers,
                                                    const std::vector<size_t> &bufferPtrOffsets,
                                                    const cl::EventPtrs &waitEvents,
                                                    CLEventImpl::CreateFunc *eventCreateFunc)
{
    UNIMPLEMENTED();
    ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES);
}

angle::Result CLCommandQueueVk::enqueueMarkerWithWaitList(const cl::EventPtrs &waitEvents,
                                                          CLEventImpl::CreateFunc *eventCreateFunc)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);

    ANGLE_TRY(processWaitlist(waitEvents));
    ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued));

    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::enqueueMarker(CLEventImpl::CreateFunc &eventCreateFunc)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);

    // This deprecated API is essentially a superset of clEnqueueBarrier: it also returns an
    // event object (i.e. the marker), which clEnqueueBarrier does not provide
    ANGLE_TRY(insertBarrier());

    ANGLE_TRY(createEvent(&eventCreateFunc, cl::ExecutionStatus::Queued));

    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::enqueueWaitForEvents(const cl::EventPtrs &events)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);

    // Unlike clWaitForEvents, this routine is non-blocking
    ANGLE_TRY(processWaitlist(events));

    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::enqueueBarrierWithWaitList(const cl::EventPtrs &waitEvents,
                                                           CLEventImpl::CreateFunc *eventCreateFunc)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);

    // The barrier command either waits for a list of events to complete, or, if the list is
    // empty, waits for all commands previously enqueued in command_queue to complete before
    // it completes
    if (waitEvents.empty())
    {
        ANGLE_TRY(insertBarrier());
    }
    else
    {
        ANGLE_TRY(processWaitlist(waitEvents));
    }

    ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued));

    return angle::Result::Continue;
}

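// Records a full compute-to-compute memory barrier in the current compute pass so that prior
// shader writes are visible to subsequent reads and writes.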
angle::Result CLCommandQueueVk::insertBarrier()
{
    VkMemoryBarrier memoryBarrier = {VK_STRUCTURE_TYPE_MEMORY_BARRIER, nullptr,
                                     VK_ACCESS_SHADER_WRITE_BIT,
                                     VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT};
    mComputePassCommands->getCommandBuffer().pipelineBarrier(
        VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1,
        &memoryBarrier, 0, nullptr, 0, nullptr);

    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::enqueueBarrier()
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);

    ANGLE_TRY(insertBarrier());

    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::flush()
{
    ANGLE_TRACE_EVENT0("gpu.angle", "CLCommandQueueVk::flush");

    QueueSerial lastSubmittedQueueSerial;
    {
        std::unique_lock<std::mutex> ul(mCommandQueueMutex);

        ANGLE_TRY(flushInternal());
        lastSubmittedQueueSerial = mLastSubmittedQueueSerial;
    }

    return mFinishHandler.notify(lastSubmittedQueueSerial);
}

angle::Result CLCommandQueueVk::finish()
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);

    ANGLE_TRACE_EVENT0("gpu.angle", "CLCommandQueueVk::finish");

    // Blocking finish
    return finishInternal();
}

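// After the GPU work for a serial has finished, copy the contents of any staged host-transfer
// buffers back out to the user-provided pointers (reads); writes and fills need no host work.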
angle::Result CLCommandQueueVk::syncHostBuffers(HostTransferEntries &hostTransferList)
{
    if (!hostTransferList.empty())
    {
        for (const HostTransferEntry &hostTransferEntry : hostTransferList)
        {
            const HostTransferConfig &transferConfig = hostTransferEntry.transferConfig;
            CLBufferVk &transferBufferVk =
                hostTransferEntry.transferBufferHandle->getImpl<CLBufferVk>();
            switch (hostTransferEntry.transferConfig.type)
            {
                case CL_COMMAND_FILL_BUFFER:
                case CL_COMMAND_WRITE_BUFFER:
                case CL_COMMAND_WRITE_BUFFER_RECT:
                    // Nothing left to do here
                    break;
                case CL_COMMAND_READ_BUFFER:
                case CL_COMMAND_READ_IMAGE:
                    if (transferConfig.rowPitch == 0 && transferConfig.slicePitch == 0)
                    {
                        ANGLE_TRY(transferBufferVk.copyTo(
                            transferConfig.dstHostPtr, transferConfig.offset, transferConfig.size));
                    }
                    else
                    {
                        ANGLE_TRY(transferBufferVk.copyToWithPitch(
                            transferConfig.dstHostPtr, transferConfig.offset, transferConfig.size,
                            transferConfig.rowPitch, transferConfig.slicePitch,
                            transferConfig.region, transferConfig.elementSize));
                    }
                    break;
                case CL_COMMAND_READ_BUFFER_RECT:
                    ANGLE_TRY(transferBufferVk.getRect(
                        transferConfig.srcRect, transferConfig.dstRect, transferConfig.dstHostPtr));
                    break;
                default:
                    UNIMPLEMENTED();
                    break;
            }
        }
    }
    hostTransferList.clear();

    return angle::Result::Continue;
}

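// Tracks the memory object as used by the commands under the current queue serial and inserts
// an execution barrier if a read-after-write hazard with a previously used object is possible.
// Images additionally get a layout transition for compute shader writes.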
angle::Result CLCommandQueueVk::addMemoryDependencies(cl::Memory *clMem)
{
    cl::Memory *parentMem = clMem->getParent() ? clMem->getParent().get() : nullptr;

    // Take a usage count
    mCommandsStateMap[mComputePassCommands->getQueueSerial()].memories.emplace_back(clMem);

    // Handle a possible resource RAW hazard
    bool needsBarrier = false;
    if (clMem->getFlags().intersects(CL_MEM_READ_WRITE))
    {
        // Texel buffers have backing buffer objects
        if (mDependencyTracker.contains(clMem) || mDependencyTracker.contains(parentMem) ||
            mDependencyTracker.size() == kMaxDependencyTrackerSize)
        {
            needsBarrier = true;
            mDependencyTracker.clear();
        }
        mDependencyTracker.insert(clMem);
        if (parentMem)
        {
            mDependencyTracker.insert(parentMem);
        }
    }

    // Insert a layout transition for images
    if (cl::IsImageType(clMem->getType()))
    {
        CLImageVk &vkMem = clMem->getImpl<CLImageVk>();
        mComputePassCommands->imageWrite(mContext, gl::LevelIndex(0), 0, 1,
                                         vkMem.getImage().getAspectFlags(),
                                         vk::ImageLayout::ComputeShaderWrite, &vkMem.getImage());
    }
    if (needsBarrier)
    {
        ANGLE_TRY(insertBarrier());
    }

    return angle::Result::Continue;
}

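// Sets up everything the kernel needs before dispatch: descriptor set and pipeline layouts,
// descriptor writes for literal samplers, kernel arguments, and (if the kernel uses printf)
// the printf buffer, plus push constant updates for POD arguments and memory dependencies for
// every buffer and image argument.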
angle::Result CLCommandQueueVk::processKernelResources(CLKernelVk &kernelVk)
{
    bool podBufferPresent = false;
    uint32_t podBinding = 0;
    VkDescriptorType podDescriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER;
    const CLProgramVk::DeviceProgramData *devProgramData =
        kernelVk.getProgram()->getDeviceProgramData(mCommandQueue.getDevice().getNative());
    ASSERT(devProgramData != nullptr);

    // Set up the descriptor set layouts and allocate the descriptor sets. The layouts are set
    // up in the order of their appearance, as Vulkan requires them to point to valid handles.
    angle::EnumIterator<DescriptorSetIndex> layoutIndex(DescriptorSetIndex::LiteralSampler);
    for (DescriptorSetIndex index : angle::AllEnums<DescriptorSetIndex>())
    {
        if (!kernelVk.getDescriptorSetLayoutDesc(index).empty())
        {
            // Set up the descriptor set layout
            ANGLE_CL_IMPL_TRY_ERROR(mContext->getDescriptorSetLayoutCache()->getDescriptorSetLayout(
                                        mContext, kernelVk.getDescriptorSetLayoutDesc(index),
                                        &kernelVk.getDescriptorSetLayouts()[*layoutIndex]),
                                    CL_INVALID_OPERATION);
            ASSERT(kernelVk.getDescriptorSetLayouts()[*layoutIndex]->valid());

            // Allocate the descriptor set
            ANGLE_TRY(mContext->allocateDescriptorSet(&kernelVk, index, layoutIndex,
                                                      mComputePassCommands));
            ++layoutIndex;
        }
    }

    // Set up the pipeline layout
    ANGLE_CL_IMPL_TRY_ERROR(kernelVk.initPipelineLayout(), CL_INVALID_OPERATION);

    // Retain the kernel object until we finish executing it later
    mCommandsStateMap[mComputePassCommands->getQueueSerial()].kernels.emplace_back(
        &kernelVk.getFrontendObject());

    // Process descriptor sets used by the kernel
    vk::DescriptorSetArray<UpdateDescriptorSetsBuilder> updateDescriptorSetsBuilders;

    UpdateDescriptorSetsBuilder &literalSamplerDescSetBuilder =
        updateDescriptorSetsBuilders[DescriptorSetIndex::LiteralSampler];

    // Create/set up the literal samplers declared by the program
    for (const ClspvLiteralSampler &literalSampler : devProgramData->reflectionData.literalSamplers)
    {
        cl::SamplerPtr clLiteralSampler =
            cl::SamplerPtr(cl::Sampler::Cast(this->mContext->getFrontendObject().createSampler(
                literalSampler.normalizedCoords, literalSampler.addressingMode,
                literalSampler.filterMode)));

        // Release immediately to ensure a correct refcount
        clLiteralSampler->release();
        ASSERT(clLiteralSampler != nullptr);
        CLSamplerVk &vkLiteralSampler = clLiteralSampler->getImpl<CLSamplerVk>();

        VkDescriptorImageInfo &samplerInfo =
            literalSamplerDescSetBuilder.allocDescriptorImageInfo();
        samplerInfo.sampler = vkLiteralSampler.getSamplerHelper().get().getHandle();
        samplerInfo.imageView = VK_NULL_HANDLE;
        samplerInfo.imageLayout = VK_IMAGE_LAYOUT_UNDEFINED;

        VkWriteDescriptorSet &writeDescriptorSet =
            literalSamplerDescSetBuilder.allocWriteDescriptorSet();
        writeDescriptorSet.descriptorCount = 1;
        writeDescriptorSet.descriptorType = VK_DESCRIPTOR_TYPE_SAMPLER;
        writeDescriptorSet.pImageInfo = &samplerInfo;
        writeDescriptorSet.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
        writeDescriptorSet.dstSet = kernelVk.getDescriptorSet(DescriptorSetIndex::LiteralSampler);
        writeDescriptorSet.dstBinding = literalSampler.binding;

        mCommandsStateMap[mComputePassCommands->getQueueSerial()].samplers.emplace_back(
            clLiteralSampler);
    }

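    // Walk the kernel arguments reported by clspv reflection and record either a descriptor
    // write (buffers, images, samplers, texel buffers) or a push constant update (POD push
    // constant arguments) for each of them.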
    CLKernelArguments args = kernelVk.getArgs();
    UpdateDescriptorSetsBuilder &kernelArgDescSetBuilder =
        updateDescriptorSetsBuilders[DescriptorSetIndex::KernelArguments];
    for (size_t index = 0; index < args.size(); index++)
    {
        const auto &arg = args.at(index);
        switch (arg.type)
        {
            case NonSemanticClspvReflectionArgumentUniform:
            case NonSemanticClspvReflectionArgumentStorageBuffer:
            {
                cl::Memory *clMem = cl::Buffer::Cast(static_cast<const cl_mem>(arg.handle));
                CLBufferVk &vkMem = clMem->getImpl<CLBufferVk>();

                ANGLE_TRY(addMemoryDependencies(clMem));

                // Update the buffer/descriptor info
                VkDescriptorBufferInfo &bufferInfo =
                    kernelArgDescSetBuilder.allocDescriptorBufferInfo();
                bufferInfo.range = clMem->getSize();
                bufferInfo.offset = clMem->getOffset();
                bufferInfo.buffer = vkMem.getBuffer().getBuffer().getHandle();
                VkWriteDescriptorSet &writeDescriptorSet =
                    kernelArgDescSetBuilder.allocWriteDescriptorSet();
                writeDescriptorSet.descriptorCount = 1;
                writeDescriptorSet.descriptorType =
                    arg.type == NonSemanticClspvReflectionArgumentUniform
                        ? VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER
                        : VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
                writeDescriptorSet.pBufferInfo = &bufferInfo;
                writeDescriptorSet.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
                writeDescriptorSet.dstSet =
                    kernelVk.getDescriptorSet(DescriptorSetIndex::KernelArguments);
                writeDescriptorSet.dstBinding = arg.descriptorBinding;
                break;
            }
            case NonSemanticClspvReflectionArgumentPodPushConstant:
            {
                ASSERT(!podBufferPresent);

                // The spec requires the size and offset to be multiples of 4; round the size
                // up and the offset down to ensure this.
                uint32_t offset = roundDownPow2(arg.pushConstOffset, 4u);
                uint32_t size =
                    roundUpPow2(arg.pushConstOffset + arg.pushConstantSize, 4u) - offset;
                ASSERT(offset + size <= kernelVk.getPodArgumentPushConstantsData().size());
                mComputePassCommands->getCommandBuffer().pushConstants(
                    kernelVk.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT, offset, size,
                    &kernelVk.getPodArgumentPushConstantsData()[offset]);
                break;
            }
            case NonSemanticClspvReflectionArgumentWorkgroup:
            {
                // Nothing to do here; this is already taken care of during clSetKernelArg
                break;
            }
            case NonSemanticClspvReflectionArgumentSampler:
            {
                cl::Sampler *clSampler =
                    cl::Sampler::Cast(*static_cast<const cl_sampler *>(arg.handle));
                CLSamplerVk &vkSampler = clSampler->getImpl<CLSamplerVk>();
                VkDescriptorImageInfo &samplerInfo =
                    kernelArgDescSetBuilder.allocDescriptorImageInfo();
                samplerInfo.sampler = vkSampler.getSamplerHelper().get().getHandle();
                VkWriteDescriptorSet &writeDescriptorSet =
                    kernelArgDescSetBuilder.allocWriteDescriptorSet();
                writeDescriptorSet.descriptorCount = 1;
                writeDescriptorSet.descriptorType = VK_DESCRIPTOR_TYPE_SAMPLER;
                writeDescriptorSet.pImageInfo = &samplerInfo;
                writeDescriptorSet.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
                writeDescriptorSet.dstSet =
                    kernelVk.getDescriptorSet(DescriptorSetIndex::KernelArguments);
                writeDescriptorSet.dstBinding = arg.descriptorBinding;

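                // If reflection reports a normalized-coordinates sampler mask for this
                // argument, ensure the bound sampler uses normalized coordinates and push the
                // mask so the generated kernel code can compensate.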
                const VkPushConstantRange *samplerMaskRange =
                    devProgramData->getNormalizedSamplerMaskRange(index);
                if (samplerMaskRange != nullptr)
                {
                    if (!clSampler->getNormalizedCoords())
                    {
                        ANGLE_TRY(vkSampler.createNormalized());
                        samplerInfo.sampler =
                            vkSampler.getSamplerHelperNormalized().get().getHandle();
                    }
                    uint32_t mask = vkSampler.getSamplerMask();
                    mComputePassCommands->getCommandBuffer().pushConstants(
                        kernelVk.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT,
                        samplerMaskRange->offset, samplerMaskRange->size, &mask);
                }
                break;
            }
            case NonSemanticClspvReflectionArgumentStorageImage:
            case NonSemanticClspvReflectionArgumentSampledImage:
            {
                cl::Memory *clMem = cl::Image::Cast(static_cast<const cl_mem>(arg.handle));
                CLImageVk &vkMem = clMem->getImpl<CLImageVk>();

                ANGLE_TRY(addMemoryDependencies(clMem));

                cl_image_format imageFormat = vkMem.getFormat();
                const VkPushConstantRange *imageDataChannelOrderRange =
                    devProgramData->getImageDataChannelOrderRange(index);
                if (imageDataChannelOrderRange != nullptr)
                {
                    mComputePassCommands->getCommandBuffer().pushConstants(
                        kernelVk.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT,
                        imageDataChannelOrderRange->offset, imageDataChannelOrderRange->size,
                        &imageFormat.image_channel_order);
                }

                const VkPushConstantRange *imageDataChannelDataTypeRange =
                    devProgramData->getImageDataChannelDataTypeRange(index);
                if (imageDataChannelDataTypeRange != nullptr)
                {
                    mComputePassCommands->getCommandBuffer().pushConstants(
                        kernelVk.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT,
                        imageDataChannelDataTypeRange->offset, imageDataChannelDataTypeRange->size,
                        &imageFormat.image_channel_data_type);
                }

                // Update the image/descriptor info
                VkDescriptorImageInfo &imageInfo =
                    kernelArgDescSetBuilder.allocDescriptorImageInfo();
                imageInfo.imageLayout = arg.type == NonSemanticClspvReflectionArgumentStorageImage
                                            ? VK_IMAGE_LAYOUT_GENERAL
                                            : vkMem.getImage().getCurrentLayout();
                imageInfo.imageView = vkMem.getImageView().getHandle();
                imageInfo.sampler = VK_NULL_HANDLE;
                VkWriteDescriptorSet &writeDescriptorSet =
                    kernelArgDescSetBuilder.allocWriteDescriptorSet();
                writeDescriptorSet.descriptorCount = 1;
                writeDescriptorSet.descriptorType =
                    arg.type == NonSemanticClspvReflectionArgumentStorageImage
                        ? VK_DESCRIPTOR_TYPE_STORAGE_IMAGE
                        : VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE;
                writeDescriptorSet.pImageInfo = &imageInfo;
                writeDescriptorSet.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
                writeDescriptorSet.dstSet =
                    kernelVk.getDescriptorSet(DescriptorSetIndex::KernelArguments);
                writeDescriptorSet.dstBinding = arg.descriptorBinding;
                break;
            }
            case NonSemanticClspvReflectionArgumentUniformTexelBuffer:
            case NonSemanticClspvReflectionArgumentStorageTexelBuffer:
            {
                cl::Memory *clMem = cl::Image::Cast(static_cast<const cl_mem>(arg.handle));
                CLImageVk &vkMem = clMem->getImpl<CLImageVk>();

                ANGLE_TRY(addMemoryDependencies(clMem));

                VkBufferView &bufferView = kernelArgDescSetBuilder.allocBufferView();
                const vk::BufferView *vkBufferView = nullptr;
                ANGLE_TRY(vkMem.getBufferView(&vkBufferView));
                bufferView = vkBufferView->getHandle();

                VkWriteDescriptorSet &writeDescriptorSet =
                    kernelArgDescSetBuilder.allocWriteDescriptorSet();
                writeDescriptorSet.descriptorCount = 1;
                writeDescriptorSet.descriptorType =
                    arg.type == NonSemanticClspvReflectionArgumentStorageTexelBuffer
                        ? VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER
                        : VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER;
                writeDescriptorSet.pImageInfo = nullptr;
                writeDescriptorSet.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
                writeDescriptorSet.dstSet =
                    kernelVk.getDescriptorSet(DescriptorSetIndex::KernelArguments);
                writeDescriptorSet.dstBinding = arg.descriptorBinding;
                writeDescriptorSet.pTexelBufferView = &bufferView;

                break;
            }
            case NonSemanticClspvReflectionArgumentPodUniform:
            case NonSemanticClspvReflectionArgumentPodStorageBuffer:
            {
                if (!podBufferPresent)
                {
                    podBufferPresent = true;
                    podBinding = arg.descriptorBinding;
                    podDescriptorType = arg.type == NonSemanticClspvReflectionArgumentPodUniform
                                            ? VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER
                                            : VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
                }
                break;
            }
            case NonSemanticClspvReflectionArgumentPointerUniform:
            case NonSemanticClspvReflectionArgumentPointerPushConstant:
            default:
            {
                UNIMPLEMENTED();
                break;
            }
        }
    }
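    // POD arguments that are not push constants are packed into a single uniform or storage
    // buffer; emit one descriptor write for it at the binding reported by reflection.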
    if (podBufferPresent)
    {
        cl::MemoryPtr clMem = kernelVk.getPodBuffer();
        ASSERT(clMem != nullptr);
        CLBufferVk &vkMem = clMem->getImpl<CLBufferVk>();

        VkDescriptorBufferInfo &bufferInfo = kernelArgDescSetBuilder.allocDescriptorBufferInfo();
        bufferInfo.range = clMem->getSize();
        bufferInfo.offset = clMem->getOffset();
        bufferInfo.buffer = vkMem.getBuffer().getBuffer().getHandle();

        ANGLE_TRY(addMemoryDependencies(clMem.get()));

        VkWriteDescriptorSet &writeDescriptorSet =
            kernelArgDescSetBuilder.allocWriteDescriptorSet();
        writeDescriptorSet.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
        writeDescriptorSet.pNext = nullptr;
        writeDescriptorSet.dstSet = kernelVk.getDescriptorSet(DescriptorSetIndex::KernelArguments);
        writeDescriptorSet.dstBinding = podBinding;
        writeDescriptorSet.dstArrayElement = 0;
        writeDescriptorSet.descriptorCount = 1;
        writeDescriptorSet.descriptorType = podDescriptorType;
        writeDescriptorSet.pImageInfo = nullptr;
        writeDescriptorSet.pBufferInfo = &bufferInfo;
    }

    // Process the printf storage buffer
    if (kernelVk.usesPrintf())
    {
        UpdateDescriptorSetsBuilder &printfDescSetBuilder =
            updateDescriptorSetsBuilders[DescriptorSetIndex::Printf];

        cl::MemoryPtr clMem = getOrCreatePrintfBuffer();
        CLBufferVk &vkMem = clMem->getImpl<CLBufferVk>();
        uint8_t *mapPointer = nullptr;
        ANGLE_TRY(vkMem.map(mapPointer, 0));
        // The spec calls out "The first 4 bytes of the buffer should be zero-initialized."
        memset(mapPointer, 0, 4);

        auto &bufferInfo = printfDescSetBuilder.allocDescriptorBufferInfo();
        bufferInfo.range = clMem->getSize();
        bufferInfo.offset = clMem->getOffset();
        bufferInfo.buffer = vkMem.getBuffer().getBuffer().getHandle();

        auto &writeDescriptorSet = printfDescSetBuilder.allocWriteDescriptorSet();
        writeDescriptorSet.descriptorCount = 1;
        writeDescriptorSet.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
        writeDescriptorSet.pBufferInfo = &bufferInfo;
        writeDescriptorSet.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
        writeDescriptorSet.dstSet = kernelVk.getDescriptorSet(DescriptorSetIndex::Printf);
        writeDescriptorSet.dstBinding = kernelVk.getProgram()
                                            ->getDeviceProgramData(kernelVk.getKernelName().c_str())
                                            ->reflectionData.printfBufferStorage.binding;

        mNeedPrintfHandling = true;
        mPrintfInfos = kernelVk.getProgram()->getPrintfDescriptors(kernelVk.getKernelName());
    }

    angle::EnumIterator<DescriptorSetIndex> descriptorSetIndex(DescriptorSetIndex::LiteralSampler);
    for (DescriptorSetIndex index : angle::AllEnums<DescriptorSetIndex>())
    {
        if (!kernelVk.getDescriptorSetLayoutDesc(index).empty())
        {
            mContext->getPerfCounters().writeDescriptorSets =
                updateDescriptorSetsBuilders[index].flushDescriptorSetUpdates(
                    mContext->getRenderer()->getDevice());

            VkDescriptorSet descriptorSet = kernelVk.getDescriptorSet(index);
            mComputePassCommands->getCommandBuffer().bindDescriptorSets(
                kernelVk.getPipelineLayout(), VK_PIPELINE_BIND_POINT_COMPUTE, *descriptorSetIndex,
                1, &descriptorSet, 0, nullptr);

            ++descriptorSetIndex;
        }
    }

    return angle::Result::Continue;
}

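// Pushes the dispatch constants reported by clspv reflection: global offset, global size,
// enqueued local size, and the workgroup count for this NDRange.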
angle::Result CLCommandQueueVk::processGlobalPushConstants(CLKernelVk &kernelVk,
                                                           const cl::NDRange &ndrange)
{
    const CLProgramVk::DeviceProgramData *devProgramData =
        kernelVk.getProgram()->getDeviceProgramData(mCommandQueue.getDevice().getNative());
    ASSERT(devProgramData != nullptr);

    const VkPushConstantRange *globalOffsetRange = devProgramData->getGlobalOffsetRange();
    if (globalOffsetRange != nullptr)
    {
        mComputePassCommands->getCommandBuffer().pushConstants(
            kernelVk.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT, globalOffsetRange->offset,
            globalOffsetRange->size, ndrange.globalWorkOffset.data());
    }

    const VkPushConstantRange *globalSizeRange = devProgramData->getGlobalSizeRange();
    if (globalSizeRange != nullptr)
    {
        mComputePassCommands->getCommandBuffer().pushConstants(
            kernelVk.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT, globalSizeRange->offset,
            globalSizeRange->size, ndrange.globalWorkSize.data());
    }

    const VkPushConstantRange *enqueuedLocalSizeRange = devProgramData->getEnqueuedLocalSizeRange();
    if (enqueuedLocalSizeRange != nullptr)
    {
        mComputePassCommands->getCommandBuffer().pushConstants(
            kernelVk.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT,
            enqueuedLocalSizeRange->offset, enqueuedLocalSizeRange->size,
            ndrange.localWorkSize.data());
    }

    const VkPushConstantRange *numWorkgroupsRange = devProgramData->getNumWorkgroupsRange();
    if (numWorkgroupsRange != nullptr)
    {
        // We support non-uniform workgroups, thus take the ceiling of the quotient
        uint32_t numWorkgroups[3] = {
            UnsignedCeilDivide(ndrange.globalWorkSize[0], ndrange.localWorkSize[0]),
            UnsignedCeilDivide(ndrange.globalWorkSize[1], ndrange.localWorkSize[1]),
            UnsignedCeilDivide(ndrange.globalWorkSize[2], ndrange.localWorkSize[2])};
        mComputePassCommands->getCommandBuffer().pushConstants(
            kernelVk.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT, numWorkgroupsRange->offset,
            numWorkgroupsRange->size, &numWorkgroups);
    }

    return angle::Result::Continue;
}

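// Hands the recorded compute pass commands off to RendererVk's primary command buffer and
// starts a fresh queue serial for the next batch. A host-read barrier is recorded first if
// any host-visible buffer was written.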
angle::Result CLCommandQueueVk::flushComputePassCommands()
{
    if (mComputePassCommands->empty())
    {
        return angle::Result::Continue;
    }

    // Flush any host visible buffers by adding appropriate barriers
    if (mComputePassCommands->getAndResetHasHostVisibleBufferWrite())
    {
        // Make sure all writes to host-visible buffers are flushed.
        VkMemoryBarrier memoryBarrier = {};
        memoryBarrier.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER;
        memoryBarrier.srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT;
        memoryBarrier.dstAccessMask = VK_ACCESS_HOST_READ_BIT | VK_ACCESS_HOST_WRITE_BIT;

        mComputePassCommands->getCommandBuffer().memoryBarrier(
            VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
            VK_PIPELINE_STAGE_HOST_BIT, memoryBarrier);
    }

    // Get hold of the queue serial that is being flushed; the command buffer is reset after
    // the flush.
    mLastFlushedQueueSerial = mComputePassCommands->getQueueSerial();

    // Flush our compute commands to RendererVk's primary command buffer
    ANGLE_TRY(mContext->getRenderer()->flushOutsideRPCommands(
        mContext, getProtectionType(), egl::ContextPriority::Medium, &mComputePassCommands));

    mContext->getPerfCounters().flushedOutsideRenderPassCommandBuffers++;

    // Generate a new serial for the next batch of commands
    mComputePassCommands->setQueueSerial(
        mQueueSerialIndex, mContext->getRenderer()->generateQueueSerial(mQueueSerialIndex));

    return angle::Result::Continue;
}

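// Handles an event wait list: events from this queue are satisfied with a single execution
// barrier, while user events and events from other queues are deferred until submission time.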
angle::Result CLCommandQueueVk::processWaitlist(const cl::EventPtrs &waitEvents)
{
    if (!waitEvents.empty())
    {
        bool insertedBarrier = false;
        for (const cl::EventPtr &event : waitEvents)
        {
            if (event->getImpl<CLEventVk>().isUserEvent() ||
                event->getCommandQueue() != &mCommandQueue)
            {
                // We cannot use a barrier in these cases, so defer the event handling until
                // submission time
                // TODO: Perhaps we could utilize VkEvents here instead and have GPU wait(s)
                // https://anglebug.com/42267109
                mExternalEvents.push_back(event);
            }
            else if (event->getCommandQueue() == &mCommandQueue && !insertedBarrier)
            {
                // As long as there is at least one dependent command in the same queue, we
                // only need to insert one execution barrier
                ANGLE_TRY(insertBarrier());

                insertedBarrier = true;
            }
        }
    }
    return angle::Result::Continue;
}

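// Submits the flushed commands to the renderer's queue and records the submitted serial so
// completion can later be waited on.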
angle::Result CLCommandQueueVk::submitCommands()
{
    ANGLE_TRACE_EVENT0("gpu.angle", "CLCommandQueueVk::submitCommands()");

    ASSERT(hasCommandsPendingSubmission());

    // Kick off the renderer submit
    ANGLE_TRY(mContext->getRenderer()->submitCommands(mContext, getProtectionType(),
                                                      egl::ContextPriority::Medium, nullptr,
                                                      nullptr, {}, mLastFlushedQueueSerial));

    mLastSubmittedQueueSerial = mLastFlushedQueueSerial;

    // Now that we have submitted commands, some of the pending garbage may no longer be
    // pending and should be moved to the garbage list.
    mContext->getRenderer()->cleanupPendingSubmissionGarbage();

    return angle::Result::Continue;
}

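// Fills in the frontend's event-creation hook; the resulting CLEventVk is tied to the queue
// serial of the commands currently being recorded.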
angle::Result CLCommandQueueVk::createEvent(CLEventImpl::CreateFunc *createFunc,
                                            cl::ExecutionStatus initialStatus)
{
    if (createFunc != nullptr)
    {
        *createFunc = [initialStatus, queueSerial = mComputePassCommands->getQueueSerial()](
                          const cl::Event &event) {
            auto eventVk = new (std::nothrow) CLEventVk(event, initialStatus, queueSerial);
            if (eventVk == nullptr)
            {
                ERR() << "Failed to create command event object!";
                return CLEventImpl::Ptr(nullptr);
            }
            return CLEventImpl::Ptr(eventVk);
        };
    }
    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::submitEmptyCommand()
{
    // This is called as part of resetting the command buffer, and the command buffer has to
    // be empty.
    ASSERT(mComputePassCommands->empty());

    // There is nothing to be flushed; mark it flushed and do a submit to signal the queue serial
    mLastFlushedQueueSerial = mComputePassCommands->getQueueSerial();
    ANGLE_TRY(submitCommands());
    ANGLE_TRY(finishQueueSerialInternal(mLastSubmittedQueueSerial));

    // Increment the queue serial for the next command batch
    mComputePassCommands->setQueueSerial(
        mQueueSerialIndex, mContext->getRenderer()->generateQueueSerial(mQueueSerialIndex));

    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::resetCommandBufferWithError(cl_int errorCode)
{
    // We got an error, so reset the command buffer and report the error back to all the
    // associated events
    ASSERT(errorCode != CL_SUCCESS);

    QueueSerial currentSerial = mComputePassCommands->getQueueSerial();
    mComputePassCommands->getCommandBuffer().reset();

    for (cl::EventPtr event : mCommandsStateMap[currentSerial].events)
    {
        CLEventVk *eventVk = &event->getImpl<CLEventVk>();
        if (!eventVk->isUserEvent())
        {
            ANGLE_TRY(
                eventVk->setStatusAndExecuteCallback(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST));
        }
    }
    mCommandsStateMap.erase(currentSerial);
    mExternalEvents.clear();

    // The command buffer has been reset, so the associated queue serial will not get signaled,
    // leading to causality issues. Submit an empty command to keep the queue serial timeline
    // intact.
    ANGLE_TRY(submitEmptyCommand());

    ANGLE_CL_RETURN_ERROR(errorCode);
}

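// Waits for the given queue serial to finish on the device, then performs the host-side
// post-processing: host buffer sync, printf handling, and event completion callbacks.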
angle::Result CLCommandQueueVk::finishQueueSerialInternal(const QueueSerial queueSerial)
{
    // The queue serial must belong to this queue and the work must have been submitted.
    ASSERT(queueSerial.getIndex() == mQueueSerialIndex);
    ASSERT(mContext->getRenderer()->hasQueueSerialSubmitted(queueSerial));

    ANGLE_TRY(mContext->getRenderer()->finishQueueSerial(mContext, queueSerial));

    // Ensure memory objects are synced back to the host CPU
    ANGLE_TRY(syncHostBuffers(mCommandsStateMap[queueSerial].hostTransferList));

    if (mNeedPrintfHandling)
    {
        ANGLE_TRY(processPrintfBuffer());
        mNeedPrintfHandling = false;
    }

    // Events associated with this queue serial are now ready to be marked complete
    ANGLE_TRY(SetEventsWithQueueSerialToState(mCommandsStateMap[queueSerial].events, queueSerial,
                                              cl::ExecutionStatus::Complete));

    mExternalEvents.clear();
    mCommandsStateMap.erase(queueSerial);

    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::finishQueueSerial(const QueueSerial queueSerial)
{
    ASSERT(queueSerial.getIndex() == getQueueSerialIndex());
    ASSERT(mContext->getRenderer()->hasQueueSerialSubmitted(queueSerial));

    ANGLE_TRY(mContext->getRenderer()->finishQueueSerial(mContext, queueSerial));

    std::lock_guard<std::mutex> sl(mCommandQueueMutex);

    return finishQueueSerialInternal(queueSerial);
}

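// Flushes and submits any recorded commands. External dependencies are resolved first: user
// events are waited on and other queues are finished, after which events associated with this
// flush are moved through the Submitted and Running states.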
angle::Result CLCommandQueueVk::flushInternal()
{
    if (!mComputePassCommands->empty())
    {
        // If we still have dependent events, handle them now
        if (!mExternalEvents.empty())
        {
            for (const auto &depEvent : mExternalEvents)
            {
                if (depEvent->getImpl<CLEventVk>().isUserEvent())
                {
                    // We just wait here for the user to set the event object
                    cl_int status = CL_QUEUED;
                    ANGLE_TRY(depEvent->getImpl<CLEventVk>().waitForUserEventStatus());
                    ANGLE_TRY(depEvent->getImpl<CLEventVk>().getCommandExecutionStatus(status));
                    if (status < 0)
                    {
                        ERR() << "Invalid dependent user-event (" << depEvent.get()
                              << ") status encountered!";
                        ANGLE_TRY(resetCommandBufferWithError(
                            CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST));
                    }
                }
                else
                {
                    // Otherwise, we just need to submit/finish the dependent event queues
                    // here that are not associated with this queue
                    ANGLE_TRY(depEvent->getCommandQueue()->finish());
                }
            }
            mExternalEvents.clear();
        }

        ANGLE_TRY(flushComputePassCommands());
        CommandsState commandsState = mCommandsStateMap[mLastFlushedQueueSerial];
        ANGLE_TRY(SetEventsWithQueueSerialToState(commandsState.events, mLastFlushedQueueSerial,
                                                  cl::ExecutionStatus::Submitted));

        ANGLE_TRY(submitCommands());
        ASSERT(!hasCommandsPendingSubmission());
        ANGLE_TRY(SetEventsWithQueueSerialToState(commandsState.events, mLastSubmittedQueueSerial,
                                                  cl::ExecutionStatus::Running));
    }

    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::finishInternal()
{
    ANGLE_TRACE_EVENT0("gpu.angle", "CLCommandQueueVk::finish");
    ANGLE_TRY(flushInternal());

    return finishQueueSerialInternal(mLastSubmittedQueueSerial);
}

// Helper function to insert appropriate memory barriers before accessing the resources in the
// command buffer.
angle::Result CLCommandQueueVk::onResourceAccess(const vk::CommandBufferAccess &access)
{
    // Buffers
    for (const vk::CommandBufferBufferAccess &bufferAccess : access.getReadBuffers())
    {
        if (mComputePassCommands->usesBufferForWrite(*bufferAccess.buffer))
        {
            // Read buffers only need a new command buffer if previously used for write
            ANGLE_TRY(flushInternal());
        }

        mComputePassCommands->bufferRead(mContext, bufferAccess.accessType, bufferAccess.stage,
                                         bufferAccess.buffer);
    }

    for (const vk::CommandBufferBufferAccess &bufferAccess : access.getWriteBuffers())
    {
        if (mComputePassCommands->usesBuffer(*bufferAccess.buffer))
        {
            // Write buffers always need a new command buffer
            ANGLE_TRY(flushInternal());
        }

        mComputePassCommands->bufferWrite(mContext, bufferAccess.accessType, bufferAccess.stage,
                                          bufferAccess.buffer);
        if (bufferAccess.buffer->isHostVisible())
        {
            // Currently all buffers are host visible, so there is nothing to do here
        }
    }

    for (const vk::CommandBufferBufferExternalAcquireRelease &bufferAcquireRelease :
         access.getExternalAcquireReleaseBuffers())
    {
        mComputePassCommands->retainResourceForWrite(bufferAcquireRelease.buffer);
    }

    for (const vk::CommandBufferResourceAccess &resourceAccess : access.getAccessResources())
    {
        mComputePassCommands->retainResource(resourceAccess.resource);
    }

    return angle::Result::Continue;
}

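// Maps the printf storage buffer and lets the clspv utilities decode and print its contents
// using the kernel's printf descriptors.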
angle::Result CLCommandQueueVk::processPrintfBuffer()
{
    ASSERT(mPrintfBuffer);
    ASSERT(mNeedPrintfHandling);
    ASSERT(mPrintfInfos);

    cl::MemoryPtr clMem = getOrCreatePrintfBuffer();
    CLBufferVk &vkMem = clMem->getImpl<CLBufferVk>();

    unsigned char *data = nullptr;
    ANGLE_TRY(vkMem.map(data, 0));
    ANGLE_TRY(ClspvProcessPrintfBuffer(data, vkMem.getSize(), mPrintfInfos));
    vkMem.unmap();

    return angle::Result::Continue;
}

// A single CL buffer of size kPrintfBufferSize is set up for every command queue. This can be
// expanded later if more storage is needed.
cl::MemoryPtr CLCommandQueueVk::getOrCreatePrintfBuffer()
{
    if (!mPrintfBuffer)
    {
        mPrintfBuffer = cl::Buffer::Cast(mContext->getFrontendObject().createBuffer(
            nullptr, cl::MemFlags(CL_MEM_READ_WRITE), kPrintfBufferSize, nullptr));
    }
    return cl::MemoryPtr(mPrintfBuffer);
}

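// Returns true if any deferred external event is a user event; such events block submission
// until the application sets their status.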
bool CLCommandQueueVk::hasUserEventDependency() const
{
    return std::any_of(mExternalEvents.begin(), mExternalEvents.end(),
                       [](const cl::EventPtr &event) { return event->isUserEvent(); });
}

void CLCommandQueueVk::addEventReference(CLEventVk &eventVk)
{
    ASSERT(eventVk.getQueueSerial().valid());
    ASSERT(eventVk.getQueueSerial().getIndex() == mQueueSerialIndex);

    std::lock_guard<std::mutex> lock(mCommandQueueMutex);

    mCommandsStateMap[eventVk.getQueueSerial()].events.emplace_back(&eventVk.getFrontendObject());
}

}  // namespace rx