• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //
2 // Copyright 2021 The ANGLE Project Authors. All rights reserved.
3 // Use of this source code is governed by a BSD-style license that can be
4 // found in the LICENSE file.
5 //
6 // CLCommandQueueVk.cpp: Implements the class methods for CLCommandQueueVk.
7 
8 #include "common/PackedCLEnums_autogen.h"
9 #include "common/system_utils.h"
10 
11 #include "libANGLE/renderer/vulkan/CLCommandQueueVk.h"
12 #include "libANGLE/renderer/vulkan/CLContextVk.h"
13 #include "libANGLE/renderer/vulkan/CLDeviceVk.h"
14 #include "libANGLE/renderer/vulkan/CLEventVk.h"
15 #include "libANGLE/renderer/vulkan/CLKernelVk.h"
16 #include "libANGLE/renderer/vulkan/CLMemoryVk.h"
17 #include "libANGLE/renderer/vulkan/CLProgramVk.h"
18 #include "libANGLE/renderer/vulkan/CLSamplerVk.h"
19 #include "libANGLE/renderer/vulkan/cl_types.h"
20 #include "libANGLE/renderer/vulkan/clspv_utils.h"
21 #include "libANGLE/renderer/vulkan/vk_cache_utils.h"
22 #include "libANGLE/renderer/vulkan/vk_cl_utils.h"
23 #include "libANGLE/renderer/vulkan/vk_helpers.h"
24 #include "libANGLE/renderer/vulkan/vk_renderer.h"
25 #include "libANGLE/renderer/vulkan/vk_wrapper.h"
26 
27 #include "libANGLE/renderer/serial_utils.h"
28 
29 #include "libANGLE/CLBuffer.h"
30 #include "libANGLE/CLCommandQueue.h"
31 #include "libANGLE/CLContext.h"
32 #include "libANGLE/CLEvent.h"
33 #include "libANGLE/CLImage.h"
34 #include "libANGLE/CLKernel.h"
35 #include "libANGLE/CLSampler.h"
36 #include "libANGLE/Error.h"
37 #include "libANGLE/cl_types.h"
38 #include "libANGLE/cl_utils.h"
39 
40 #include "spirv/unified1/NonSemanticClspvReflection.h"
41 #include "vulkan/vulkan_core.h"
42 
43 #include <chrono>
44 
45 namespace rx
46 {
47 
48 namespace
49 {
// Bound how long notify() waits for a free slot in the serial queue before
// concluding the device is lost: poll every kSleepInMS, up to kTimeoutInMS
// total. (`static` dropped: the anonymous namespace already gives these
// internal linkage.)
constexpr size_t kTimeoutInMS            = 10000;
constexpr size_t kSleepInMS              = 500;
constexpr size_t kTimeoutCheckIterations = kTimeoutInMS / kSleepInMS;
53 
SetEventsWithQueueSerialToState(const cl::EventPtrs & eventList,const QueueSerial & queueSerial,cl::ExecutionStatus state)54 angle::Result SetEventsWithQueueSerialToState(const cl::EventPtrs &eventList,
55                                               const QueueSerial &queueSerial,
56                                               cl::ExecutionStatus state)
57 {
58 
59     ASSERT(state < cl::ExecutionStatus::EnumCount);
60 
61     for (cl::EventPtr event : eventList)
62     {
63         CLEventVk *eventVk = &event->getImpl<CLEventVk>();
64         if (!eventVk->isUserEvent() && eventVk->usedByCommandBuffer(queueSerial))
65         {
66             ANGLE_TRY(eventVk->setStatusAndExecuteCallback(cl::ToCLenum(state)));
67         }
68     }
69     return angle::Result::Continue;
70 }
71 
// Construct the dispatch helper for |commandQueue|. The worker thread is not
// started here; call init() to launch it.
DispatchWorkThread::DispatchWorkThread(CLCommandQueueVk *commandQueue)
    : mCommandQueue(commandQueue),
      mIsTerminating(false),
      mQueueSerials(kFixedQueueLimit),
      mQueueSerialIndex(kInvalidQueueSerialIndex)
{}
78 
DispatchWorkThread::~DispatchWorkThread()
{
    // terminate() must have been called (and the worker joined) before the
    // object is destroyed.
    ASSERT(mIsTerminating);
}
83 
// Capture the owning queue's serial index and spawn the background thread that
// finishes submitted work.
angle::Result DispatchWorkThread::init()
{
    mQueueSerialIndex = mCommandQueue->getQueueSerialIndex();
    ASSERT(mQueueSerialIndex != kInvalidQueueSerialIndex);

    mWorkerThread = std::thread(&DispatchWorkThread::finishLoop, this);

    return angle::Result::Continue;
}
93 
terminate()94 void DispatchWorkThread::terminate()
95 {
96     // Terminate the background thread
97     {
98         std::unique_lock<std::mutex> lock(mThreadMutex);
99         mIsTerminating = true;
100     }
101     mHasWorkSubmitted.notify_all();
102     if (mWorkerThread.joinable())
103     {
104         mWorkerThread.join();
105     }
106 }
107 
notify(QueueSerial queueSerial)108 angle::Result DispatchWorkThread::notify(QueueSerial queueSerial)
109 {
110     ASSERT(queueSerial.getIndex() == mQueueSerialIndex);
111 
112     // QueueSerials are always received in order, its either same or greater than last one
113     std::unique_lock<std::mutex> ul(mThreadMutex);
114     if (!mQueueSerials.empty())
115     {
116         QueueSerial &lastSerial = mQueueSerials.back();
117         ASSERT(queueSerial >= lastSerial);
118         if (queueSerial == lastSerial)
119         {
120             return angle::Result::Continue;
121         }
122     }
123 
124     // if the queue is full, it might be that device is lost, check for timeout
125     size_t numIterations = 0;
126     while (mQueueSerials.full() && numIterations < kTimeoutCheckIterations)
127     {
128         mHasEmptySlot.wait_for(ul, std::chrono::milliseconds(kSleepInMS),
129                                [this]() { return !mQueueSerials.full(); });
130         numIterations++;
131     }
132     if (numIterations == kTimeoutCheckIterations)
133     {
134         ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES);
135     }
136 
137     mQueueSerials.push(queueSerial);
138     mHasWorkSubmitted.notify_one();
139 
140     return angle::Result::Continue;
141 }
142 
// Worker-thread body: wait for queued serials and finish each one on the
// command queue. Exits only after terminate() is called AND the queue is empty.
angle::Result DispatchWorkThread::finishLoop()
{
    angle::SetCurrentThreadName("ANGLE-CL-CQD");

    while (true)
    {
        std::unique_lock<std::mutex> ul(mThreadMutex);
        // Sleep until there is work to finish or shutdown was requested.
        mHasWorkSubmitted.wait(ul, [this]() { return !mQueueSerials.empty() || mIsTerminating; });

        while (!mQueueSerials.empty())
        {
            QueueSerial queueSerial = mQueueSerials.front();
            mQueueSerials.pop();
            // A slot was just freed; wake any producer blocked in notify().
            mHasEmptySlot.notify_one();
            // Drop the lock while waiting on the GPU so notify() can keep
            // queueing serials concurrently.
            ul.unlock();
            // finish the work associated with the queue serial
            ANGLE_TRY(mCommandQueue->finishQueueSerial(queueSerial));
            ul.lock();
        }

        // Drain fully before honoring termination so no serial is left
        // unfinished.
        if (mIsTerminating)
        {
            break;
        }
    }
    return angle::Result::Continue;
}
170 
171 }  // namespace
172 
// Construct the Vulkan implementation of an OpenCL command queue. Heavyweight
// setup (command pools, serials, finish thread) is deferred to init().
CLCommandQueueVk::CLCommandQueueVk(const cl::CommandQueue &commandQueue)
    : CLCommandQueueImpl(commandQueue),
      mContext(&commandQueue.getContext().getImpl<CLContextVk>()),
      mDevice(&commandQueue.getDevice().getImpl<CLDeviceVk>()),
      mPrintfBuffer(nullptr),
      mComputePassCommands(nullptr),
      mQueueSerialIndex(kInvalidQueueSerialIndex),
      mNeedPrintfHandling(false),
      mPrintfInfos(nullptr),
      mFinishHandler(this)
{}
184 
init()185 angle::Result CLCommandQueueVk::init()
186 {
187     vk::Renderer *renderer = mContext->getRenderer();
188     ASSERT(renderer);
189 
190     ANGLE_CL_IMPL_TRY_ERROR(vk::OutsideRenderPassCommandBuffer::InitializeCommandPool(
191                                 mContext, &mCommandPool.outsideRenderPassPool,
192                                 renderer->getQueueFamilyIndex(), getProtectionType()),
193                             CL_OUT_OF_RESOURCES);
194 
195     ANGLE_CL_IMPL_TRY_ERROR(mContext->getRenderer()->getOutsideRenderPassCommandBufferHelper(
196                                 mContext, &mCommandPool.outsideRenderPassPool,
197                                 &mOutsideRenderPassCommandsAllocator, &mComputePassCommands),
198                             CL_OUT_OF_RESOURCES);
199 
200     // Generate initial QueueSerial for command buffer helper
201     ANGLE_CL_IMPL_TRY_ERROR(mContext->getRenderer()->allocateQueueSerialIndex(&mQueueSerialIndex),
202                             CL_OUT_OF_RESOURCES);
203     // and set an initial queue serial for the compute pass commands
204     mComputePassCommands->setQueueSerial(
205         mQueueSerialIndex, mContext->getRenderer()->generateQueueSerial(mQueueSerialIndex));
206 
207     // Initialize serials to be valid but appear submitted and finished.
208     mLastFlushedQueueSerial   = QueueSerial(mQueueSerialIndex, Serial());
209     mLastSubmittedQueueSerial = mLastFlushedQueueSerial;
210 
211     ANGLE_TRY(mFinishHandler.init());
212 
213     return angle::Result::Continue;
214 }
215 
// Teardown in dependency order: stop the finish thread (it references this
// queue), release the printf buffer, free the serial index, then recycle the
// command buffer and destroy the pool.
CLCommandQueueVk::~CLCommandQueueVk()
{
    mFinishHandler.terminate();

    // All recorded work must have been flushed and printf output handled by now.
    ASSERT(mComputePassCommands->empty());
    ASSERT(!mNeedPrintfHandling);

    if (mPrintfBuffer)
    {
        // The lifetime of printf buffer is scoped to command queue, release and destroy.
        const bool wasLastUser = mPrintfBuffer->release();
        ASSERT(wasLastUser);
        delete mPrintfBuffer;
    }

    VkDevice vkDevice = mContext->getDevice();

    if (mQueueSerialIndex != kInvalidQueueSerialIndex)
    {
        mContext->getRenderer()->releaseQueueSerialIndex(mQueueSerialIndex);
        mQueueSerialIndex = kInvalidQueueSerialIndex;
    }

    // Recycle the current command buffers
    mContext->getRenderer()->recycleOutsideRenderPassCommandBufferHelper(&mComputePassCommands);
    mCommandPool.outsideRenderPassPool.destroy(vkDevice);
}
243 
// Intentional no-op: "clSetCommandQueueProperty" has been deprecated as of
// OpenCL 1.1, so the call is accepted and succeeds without changing state.
// http://man.opencl.org/deprecated.html
angle::Result CLCommandQueueVk::setProperty(cl::CommandQueueProperties properties, cl_bool enable)
{
    return angle::Result::Continue;
}
250 
enqueueReadBuffer(const cl::Buffer & buffer,bool blocking,size_t offset,size_t size,void * ptr,const cl::EventPtrs & waitEvents,CLEventImpl::CreateFunc * eventCreateFunc)251 angle::Result CLCommandQueueVk::enqueueReadBuffer(const cl::Buffer &buffer,
252                                                   bool blocking,
253                                                   size_t offset,
254                                                   size_t size,
255                                                   void *ptr,
256                                                   const cl::EventPtrs &waitEvents,
257                                                   CLEventImpl::CreateFunc *eventCreateFunc)
258 {
259     std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
260 
261     ANGLE_TRY(processWaitlist(waitEvents));
262     CLBufferVk *bufferVk = &buffer.getImpl<CLBufferVk>();
263 
264     if (blocking)
265     {
266         ANGLE_TRY(finishInternal());
267         ANGLE_TRY(bufferVk->copyTo(ptr, offset, size));
268 
269         ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Complete));
270     }
271     else
272     {
273         // Stage a transfer routine
274         HostTransferConfig transferConfig;
275         transferConfig.type       = CL_COMMAND_READ_BUFFER;
276         transferConfig.offset     = offset;
277         transferConfig.size       = size;
278         transferConfig.dstHostPtr = ptr;
279         ANGLE_TRY(addToHostTransferList(bufferVk, transferConfig));
280 
281         ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued));
282     }
283 
284     return angle::Result::Continue;
285 }
286 
enqueueWriteBuffer(const cl::Buffer & buffer,bool blocking,size_t offset,size_t size,const void * ptr,const cl::EventPtrs & waitEvents,CLEventImpl::CreateFunc * eventCreateFunc)287 angle::Result CLCommandQueueVk::enqueueWriteBuffer(const cl::Buffer &buffer,
288                                                    bool blocking,
289                                                    size_t offset,
290                                                    size_t size,
291                                                    const void *ptr,
292                                                    const cl::EventPtrs &waitEvents,
293                                                    CLEventImpl::CreateFunc *eventCreateFunc)
294 {
295     std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
296 
297     ANGLE_TRY(processWaitlist(waitEvents));
298 
299     auto bufferVk = &buffer.getImpl<CLBufferVk>();
300 
301     if (blocking)
302     {
303         ANGLE_TRY(finishInternal());
304         ANGLE_TRY(bufferVk->copyFrom(ptr, offset, size));
305 
306         ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Complete));
307     }
308     else
309     {
310         // Stage a transfer routine
311         HostTransferConfig config;
312         config.type       = CL_COMMAND_WRITE_BUFFER;
313         config.offset     = offset;
314         config.size       = size;
315         config.srcHostPtr = ptr;
316         ANGLE_TRY(addToHostTransferList(bufferVk, config));
317 
318         ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued));
319     }
320 
321     return angle::Result::Continue;
322 }
323 
enqueueReadBufferRect(const cl::Buffer & buffer,bool blocking,const cl::MemOffsets & bufferOrigin,const cl::MemOffsets & hostOrigin,const cl::Coordinate & region,size_t bufferRowPitch,size_t bufferSlicePitch,size_t hostRowPitch,size_t hostSlicePitch,void * ptr,const cl::EventPtrs & waitEvents,CLEventImpl::CreateFunc * eventCreateFunc)324 angle::Result CLCommandQueueVk::enqueueReadBufferRect(const cl::Buffer &buffer,
325                                                       bool blocking,
326                                                       const cl::MemOffsets &bufferOrigin,
327                                                       const cl::MemOffsets &hostOrigin,
328                                                       const cl::Coordinate &region,
329                                                       size_t bufferRowPitch,
330                                                       size_t bufferSlicePitch,
331                                                       size_t hostRowPitch,
332                                                       size_t hostSlicePitch,
333                                                       void *ptr,
334                                                       const cl::EventPtrs &waitEvents,
335                                                       CLEventImpl::CreateFunc *eventCreateFunc)
336 {
337     std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
338 
339     ANGLE_TRY(processWaitlist(waitEvents));
340     auto bufferVk = &buffer.getImpl<CLBufferVk>();
341 
342     cl::BufferRect bufferRect{cl::Offset{bufferOrigin.x, bufferOrigin.y, bufferOrigin.z},
343                               cl::Extents{region.x, region.y, region.z}, bufferRowPitch,
344                               bufferSlicePitch, 1};
345 
346     cl::BufferRect ptrRect{cl::Offset{hostOrigin.x, hostOrigin.y, hostOrigin.z},
347                            cl::Extents{region.x, region.y, region.z}, hostRowPitch, hostSlicePitch,
348                            1};
349 
350     if (blocking)
351     {
352         ANGLE_TRY(finishInternal());
353         ANGLE_TRY(bufferVk->getRect(bufferRect, ptrRect, ptr));
354     }
355     else
356     {
357         // Stage a transfer routine
358         HostTransferConfig config;
359         config.type       = CL_COMMAND_READ_BUFFER_RECT;
360         config.srcRect    = bufferRect;
361         config.dstRect    = ptrRect;
362         config.dstHostPtr = ptr;
363         config.size       = bufferVk->getSize();
364         ANGLE_TRY(addToHostTransferList(bufferVk, config));
365     }
366 
367     ANGLE_TRY(createEvent(eventCreateFunc,
368                           blocking ? cl::ExecutionStatus::Complete : cl::ExecutionStatus::Queued));
369     return angle::Result::Continue;
370 }
371 
enqueueWriteBufferRect(const cl::Buffer & buffer,bool blocking,const cl::MemOffsets & bufferOrigin,const cl::MemOffsets & hostOrigin,const cl::Coordinate & region,size_t bufferRowPitch,size_t bufferSlicePitch,size_t hostRowPitch,size_t hostSlicePitch,const void * ptr,const cl::EventPtrs & waitEvents,CLEventImpl::CreateFunc * eventCreateFunc)372 angle::Result CLCommandQueueVk::enqueueWriteBufferRect(const cl::Buffer &buffer,
373                                                        bool blocking,
374                                                        const cl::MemOffsets &bufferOrigin,
375                                                        const cl::MemOffsets &hostOrigin,
376                                                        const cl::Coordinate &region,
377                                                        size_t bufferRowPitch,
378                                                        size_t bufferSlicePitch,
379                                                        size_t hostRowPitch,
380                                                        size_t hostSlicePitch,
381                                                        const void *ptr,
382                                                        const cl::EventPtrs &waitEvents,
383                                                        CLEventImpl::CreateFunc *eventCreateFunc)
384 {
385     std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
386 
387     ANGLE_TRY(processWaitlist(waitEvents));
388     auto bufferVk = &buffer.getImpl<CLBufferVk>();
389 
390     cl::BufferRect bufferRect{cl::Offset{bufferOrigin.x, bufferOrigin.y, bufferOrigin.z},
391                               cl::Extents{region.x, region.y, region.z}, bufferRowPitch,
392                               bufferSlicePitch, 1};
393 
394     cl::BufferRect ptrRect{cl::Offset{hostOrigin.x, hostOrigin.y, hostOrigin.z},
395                            cl::Extents{region.x, region.y, region.z}, hostRowPitch, hostSlicePitch,
396                            1};
397 
398     if (blocking)
399     {
400         ANGLE_TRY(finishInternal());
401         ANGLE_TRY(bufferVk->setRect(ptr, ptrRect, bufferRect));
402     }
403     else
404     {
405         // Stage a transfer routine
406         HostTransferConfig config;
407         config.type       = CL_COMMAND_WRITE_BUFFER_RECT;
408         config.srcRect    = ptrRect;
409         config.dstRect    = bufferRect;
410         config.srcHostPtr = ptr;
411         config.size       = bufferVk->getSize();
412         ANGLE_TRY(addToHostTransferList(bufferVk, config));
413     }
414 
415     ANGLE_TRY(createEvent(eventCreateFunc,
416                           blocking ? cl::ExecutionStatus::Complete : cl::ExecutionStatus::Queued));
417     return angle::Result::Continue;
418 }
419 
enqueueCopyBuffer(const cl::Buffer & srcBuffer,const cl::Buffer & dstBuffer,size_t srcOffset,size_t dstOffset,size_t size,const cl::EventPtrs & waitEvents,CLEventImpl::CreateFunc * eventCreateFunc)420 angle::Result CLCommandQueueVk::enqueueCopyBuffer(const cl::Buffer &srcBuffer,
421                                                   const cl::Buffer &dstBuffer,
422                                                   size_t srcOffset,
423                                                   size_t dstOffset,
424                                                   size_t size,
425                                                   const cl::EventPtrs &waitEvents,
426                                                   CLEventImpl::CreateFunc *eventCreateFunc)
427 {
428     std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
429 
430     ANGLE_TRY(processWaitlist(waitEvents));
431 
432     CLBufferVk *srcBufferVk = &srcBuffer.getImpl<CLBufferVk>();
433     CLBufferVk *dstBufferVk = &dstBuffer.getImpl<CLBufferVk>();
434 
435     vk::CommandBufferAccess access;
436     if (srcBufferVk->isSubBuffer() && dstBufferVk->isSubBuffer() &&
437         (srcBufferVk->getParent() == dstBufferVk->getParent()))
438     {
439         // this is a self copy
440         access.onBufferSelfCopy(&srcBufferVk->getBuffer());
441     }
442     else
443     {
444         access.onBufferTransferRead(&srcBufferVk->getBuffer());
445         access.onBufferTransferWrite(&dstBufferVk->getBuffer());
446     }
447 
448     vk::OutsideRenderPassCommandBuffer *commandBuffer;
449     ANGLE_TRY(getCommandBuffer(access, &commandBuffer));
450 
451     VkBufferCopy copyRegion = {srcOffset, dstOffset, size};
452     // update the offset in the case of sub-buffers
453     if (srcBufferVk->getOffset())
454     {
455         copyRegion.srcOffset += srcBufferVk->getOffset();
456     }
457     if (dstBufferVk->getOffset())
458     {
459         copyRegion.dstOffset += dstBufferVk->getOffset();
460     }
461     commandBuffer->copyBuffer(srcBufferVk->getBuffer().getBuffer(),
462                               dstBufferVk->getBuffer().getBuffer(), 1, &copyRegion);
463 
464     ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued));
465 
466     return angle::Result::Continue;
467 }
468 
enqueueCopyBufferRect(const cl::Buffer & srcBuffer,const cl::Buffer & dstBuffer,const cl::MemOffsets & srcOrigin,const cl::MemOffsets & dstOrigin,const cl::Coordinate & region,size_t srcRowPitch,size_t srcSlicePitch,size_t dstRowPitch,size_t dstSlicePitch,const cl::EventPtrs & waitEvents,CLEventImpl::CreateFunc * eventCreateFunc)469 angle::Result CLCommandQueueVk::enqueueCopyBufferRect(const cl::Buffer &srcBuffer,
470                                                       const cl::Buffer &dstBuffer,
471                                                       const cl::MemOffsets &srcOrigin,
472                                                       const cl::MemOffsets &dstOrigin,
473                                                       const cl::Coordinate &region,
474                                                       size_t srcRowPitch,
475                                                       size_t srcSlicePitch,
476                                                       size_t dstRowPitch,
477                                                       size_t dstSlicePitch,
478                                                       const cl::EventPtrs &waitEvents,
479                                                       CLEventImpl::CreateFunc *eventCreateFunc)
480 {
481     std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
482     ANGLE_TRY(processWaitlist(waitEvents));
483     ANGLE_TRY(finishInternal());
484 
485     cl::BufferRect srcRect{cl::Offset{srcOrigin.x, srcOrigin.y, srcOrigin.z},
486                            cl::Extents{region.x, region.y, region.z}, srcRowPitch, srcSlicePitch,
487                            1};
488 
489     cl::BufferRect dstRect{cl::Offset{dstOrigin.x, dstOrigin.y, dstOrigin.z},
490                            cl::Extents{region.x, region.y, region.z}, dstRowPitch, dstSlicePitch,
491                            1};
492 
493     auto srcBufferVk    = &srcBuffer.getImpl<CLBufferVk>();
494     auto dstBufferVk    = &dstBuffer.getImpl<CLBufferVk>();
495     uint8_t *mapPointer = nullptr;
496     ANGLE_TRY(srcBufferVk->map(mapPointer));
497     ASSERT(mapPointer);
498     ANGLE_TRY(dstBufferVk->setRect(static_cast<const void *>(mapPointer), srcRect, dstRect));
499 
500     ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Complete));
501     return angle::Result::Continue;
502 }
503 
enqueueFillBuffer(const cl::Buffer & buffer,const void * pattern,size_t patternSize,size_t offset,size_t size,const cl::EventPtrs & waitEvents,CLEventImpl::CreateFunc * eventCreateFunc)504 angle::Result CLCommandQueueVk::enqueueFillBuffer(const cl::Buffer &buffer,
505                                                   const void *pattern,
506                                                   size_t patternSize,
507                                                   size_t offset,
508                                                   size_t size,
509                                                   const cl::EventPtrs &waitEvents,
510                                                   CLEventImpl::CreateFunc *eventCreateFunc)
511 {
512     std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
513 
514     ANGLE_TRY(processWaitlist(waitEvents));
515 
516     CLBufferVk *bufferVk = &buffer.getImpl<CLBufferVk>();
517 
518     // Stage a transfer routine
519     HostTransferConfig config;
520     config.type        = CL_COMMAND_FILL_BUFFER;
521     config.patternSize = patternSize;
522     config.offset      = offset;
523     config.size        = size;
524     config.srcHostPtr  = pattern;
525     ANGLE_TRY(addToHostTransferList(bufferVk, config));
526 
527     ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued));
528 
529     return angle::Result::Continue;
530 }
531 
// Map a range of |buffer| into host-visible memory, returning the pointer via
// |mapPtr|. The event status reported depends on how much synchronization was
// actually performed.
angle::Result CLCommandQueueVk::enqueueMapBuffer(const cl::Buffer &buffer,
                                                 bool blocking,
                                                 cl::MapFlags mapFlags,
                                                 size_t offset,
                                                 size_t size,
                                                 const cl::EventPtrs &waitEvents,
                                                 CLEventImpl::CreateFunc *eventCreateFunc,
                                                 void *&mapPtr)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);

    ANGLE_TRY(processWaitlist(waitEvents));

    cl::ExecutionStatus eventComplete = cl::ExecutionStatus::Queued;
    // A blocking map — or one with no event to report progress through — must
    // finish all pending work up front.
    if (blocking || !eventCreateFunc)
    {
        ANGLE_TRY(finishInternal());
        eventComplete = cl::ExecutionStatus::Complete;
    }

    CLBufferVk *bufferVk = &buffer.getImpl<CLBufferVk>();
    uint8_t *mapPointer  = nullptr;
    if (buffer.getFlags().intersects(CL_MEM_USE_HOST_PTR))
    {
        // CL_MEM_USE_HOST_PTR: the application's own pointer is the mapping;
        // sync device contents into it before returning it.
        ANGLE_TRY(finishInternal());
        mapPointer = static_cast<uint8_t *>(buffer.getHostPtr()) + offset;
        ANGLE_TRY(bufferVk->copyTo(mapPointer, offset, size));
        eventComplete = cl::ExecutionStatus::Complete;
    }
    else
    {
        ANGLE_TRY(bufferVk->map(mapPointer, offset));
    }
    mapPtr = static_cast<void *>(mapPointer);

    // If the buffer is still referenced by unfinished work, downgrade the
    // reported status to Queued.
    if (bufferVk->isCurrentlyInUse())
    {
        eventComplete = cl::ExecutionStatus::Queued;
    }
    ANGLE_TRY(createEvent(eventCreateFunc, eventComplete));

    return angle::Result::Continue;
}
575 
copyImageToFromBuffer(CLImageVk & imageVk,vk::BufferHelper & buffer,const cl::MemOffsets & origin,const cl::Coordinate & region,size_t bufferOffset,ImageBufferCopyDirection direction)576 angle::Result CLCommandQueueVk::copyImageToFromBuffer(CLImageVk &imageVk,
577                                                       vk::BufferHelper &buffer,
578                                                       const cl::MemOffsets &origin,
579                                                       const cl::Coordinate &region,
580                                                       size_t bufferOffset,
581                                                       ImageBufferCopyDirection direction)
582 {
583     vk::CommandBufferAccess access;
584     vk::OutsideRenderPassCommandBuffer *commandBuffer;
585     VkImageAspectFlags aspectFlags = imageVk.getImage().getAspectFlags();
586     if (direction == ImageBufferCopyDirection::ToBuffer)
587     {
588         access.onImageTransferRead(aspectFlags, &imageVk.getImage());
589         access.onBufferTransferWrite(&buffer);
590     }
591     else
592     {
593         access.onImageTransferWrite(gl::LevelIndex(0), 1, 0,
594                                     static_cast<uint32_t>(imageVk.getArraySize()), aspectFlags,
595                                     &imageVk.getImage());
596         access.onBufferTransferRead(&buffer);
597     }
598     ANGLE_TRY(getCommandBuffer(access, &commandBuffer));
599 
600     VkBufferImageCopy copyRegion = {};
601     copyRegion.bufferOffset      = bufferOffset;
602     copyRegion.bufferRowLength   = 0;
603     copyRegion.bufferImageHeight = 0;
604     copyRegion.imageExtent       = cl_vk::GetExtent(imageVk.getExtentForCopy(region));
605     copyRegion.imageOffset       = cl_vk::GetOffset(imageVk.getOffsetForCopy(origin));
606     copyRegion.imageSubresource  = imageVk.getSubresourceLayersForCopy(
607         origin, region, imageVk.getType(), ImageCopyWith::Buffer);
608     if (imageVk.isWritable())
609     {
610         // We need an execution barrier if image can be written to by kernel
611         ANGLE_TRY(insertBarrier());
612     }
613 
614     VkMemoryBarrier memBarrier = {};
615     memBarrier.sType           = VK_STRUCTURE_TYPE_MEMORY_BARRIER;
616     memBarrier.srcAccessMask   = VK_ACCESS_MEMORY_WRITE_BIT;
617     memBarrier.dstAccessMask   = VK_ACCESS_MEMORY_READ_BIT;
618     if (direction == ImageBufferCopyDirection::ToBuffer)
619     {
620         commandBuffer->copyImageToBuffer(imageVk.getImage().getImage(),
621                                          VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
622                                          buffer.getBuffer().getHandle(), 1, &copyRegion);
623 
624         mComputePassCommands->getCommandBuffer().pipelineBarrier(
625             VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, 0, 1, &memBarrier, 0,
626             nullptr, 0, nullptr);
627     }
628     else
629     {
630         commandBuffer->copyBufferToImage(buffer.getBuffer().getHandle(),
631                                          imageVk.getImage().getImage(),
632                                          VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &copyRegion);
633 
634         mComputePassCommands->getCommandBuffer().pipelineBarrier(
635             VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &memBarrier,
636             0, nullptr, 0, nullptr);
637     }
638 
639     return angle::Result::Continue;
640 }
641 
// Stages a deferred host<->buffer transfer for the non-blocking read/write/fill/rect paths.
//
// A temporary CL buffer ("transfer buffer") the same size as |srcBuffer| is created and
// attached to the command state of the current compute-pass queue serial
// (mCommandsStateMap[...].hostTransferList); the host side is resolved when that serial
// completes. For write/fill commands the host data is packed into the transfer buffer now
// and copied into |srcBuffer| on the device timeline; for read commands the device copies
// |srcBuffer| into the transfer buffer, to be copied out to the host pointer later.
//
// srcBuffer:      the user buffer the enqueued command reads from / writes to.
// transferConfig: command type plus host pointer, offsets/sizes or rect descriptions.
angle::Result CLCommandQueueVk::addToHostTransferList(CLBufferVk *srcBuffer,
                                                      HostTransferConfig transferConfig)
{
    // TODO(aannestrand): Flush here if we reach some max-transfer-buffer heuristic
    // http://anglebug.com/377545840

    // Allocate the transfer buffer through the frontend so it is a full-fledged CL object
    // whose lifetime can be managed by a RefPointer below.
    cl::Memory *transferBufferHandle =
        cl::Buffer::Cast(this->mContext->getFrontendObject().createBuffer(
            nullptr, cl::MemFlags{CL_MEM_READ_WRITE}, srcBuffer->getSize(), nullptr));
    if (transferBufferHandle == nullptr)
    {
        ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES);
    }
    // Keyed by the current compute-pass serial so completion processing can find it.
    HostTransferEntry transferEntry{transferConfig, cl::MemoryPtr{transferBufferHandle}};
    mCommandsStateMap[mComputePassCommands->getQueueSerial()].hostTransferList.emplace_back(
        transferEntry);

    // Release initialization reference, lifetime controlled by RefPointer.
    transferBufferHandle->release();

    // We need an execution barrier if buffer can be written to by kernel
    if (!mComputePassCommands->getCommandBuffer().empty() && srcBuffer->isWritable())
    {
        // TODO(aannestrand): Look into combining these kernel execution barriers
        // http://anglebug.com/377545840
        VkMemoryBarrier memoryBarrier = {VK_STRUCTURE_TYPE_MEMORY_BARRIER, nullptr,
                                         VK_ACCESS_SHADER_WRITE_BIT,
                                         VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT};
        mComputePassCommands->getCommandBuffer().pipelineBarrier(
            VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1,
            &memoryBarrier, 0, nullptr, 0, nullptr);
    }

    // Enqueue blit/transfer cmd. srcStageMask/dstStageMask and memBarrier are configured
    // per command type below and consumed by the single pipelineBarrier after the switch.
    VkPipelineStageFlags srcStageMask  = {};
    VkPipelineStageFlags dstStageMask  = {};
    VkMemoryBarrier memBarrier         = {};
    memBarrier.sType                   = VK_STRUCTURE_TYPE_MEMORY_BARRIER;
    CLBufferVk &transferBufferHandleVk = transferBufferHandle->getImpl<CLBufferVk>();
    switch (transferConfig.type)
    {
        case CL_COMMAND_WRITE_BUFFER:
        {
            // Transfer buffer mirrors srcBuffer's layout, so src and dst offsets start equal
            // and are then adjusted by each buffer's own sub-buffer base offset.
            VkBufferCopy copyRegion = {transferConfig.offset, transferConfig.offset,
                                       transferConfig.size};
            // Pack the host data into the transfer buffer now; the device-side copy into
            // srcBuffer happens when the compute pass executes.
            ANGLE_TRY(transferBufferHandleVk.copyFrom(transferConfig.srcHostPtr,
                                                      transferConfig.offset, transferConfig.size));
            copyRegion.srcOffset += transferBufferHandleVk.getOffset();
            copyRegion.dstOffset += srcBuffer->getOffset();
            mComputePassCommands->getCommandBuffer().copyBuffer(
                transferBufferHandleVk.getBuffer().getBuffer(), srcBuffer->getBuffer().getBuffer(),
                1, &copyRegion);

            // Make the transfer write visible to subsequent kernel reads.
            srcStageMask             = VK_PIPELINE_STAGE_TRANSFER_BIT;
            dstStageMask             = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
            memBarrier.srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT;
            memBarrier.dstAccessMask = VK_ACCESS_MEMORY_READ_BIT;
            break;
        }
        case CL_COMMAND_WRITE_BUFFER_RECT:
        {
            // Pack the host rect into the transfer buffer, then copy each contiguous row
            // span into srcBuffer on the device timeline.
            ANGLE_TRY(transferBufferHandleVk.setRect(
                transferConfig.srcHostPtr, transferConfig.srcRect, transferConfig.dstRect));
            for (VkBufferCopy &copyRegion :
                 transferBufferHandleVk.rectCopyRegions(transferConfig.dstRect))
            {
                copyRegion.srcOffset += transferBufferHandleVk.getOffset();
                copyRegion.dstOffset += srcBuffer->getOffset();
                mComputePassCommands->getCommandBuffer().copyBuffer(
                    transferBufferHandleVk.getBuffer().getBuffer(),
                    srcBuffer->getBuffer().getBuffer(), 1, &copyRegion);
            }

            // Config transfer barrier
            srcStageMask             = VK_PIPELINE_STAGE_TRANSFER_BIT;
            dstStageMask             = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
            memBarrier.srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT;
            memBarrier.dstAccessMask = VK_ACCESS_MEMORY_READ_BIT;
            break;
        }
        case CL_COMMAND_READ_BUFFER:
        {
            // Device-side copy srcBuffer -> transfer buffer; the host pointer is filled in
            // from the transfer buffer when this queue serial finishes.
            VkBufferCopy copyRegion = {transferConfig.offset, transferConfig.offset,
                                       transferConfig.size};
            copyRegion.srcOffset += srcBuffer->getOffset();
            copyRegion.dstOffset += transferBufferHandleVk.getOffset();
            mComputePassCommands->getCommandBuffer().copyBuffer(
                srcBuffer->getBuffer().getBuffer(), transferBufferHandleVk.getBuffer().getBuffer(),
                1, &copyRegion);

            // NOTE(review): srcStageMask here is COMPUTE_SHADER although the producing
            // command just above is a transfer op — confirm whether TRANSFER should be
            // included so the copy's write is ordered before the host read.
            srcStageMask             = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
            dstStageMask             = VK_PIPELINE_STAGE_HOST_BIT;
            memBarrier.srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT;
            memBarrier.dstAccessMask = VK_ACCESS_MEMORY_READ_BIT;
            break;
        }
        case CL_COMMAND_READ_BUFFER_RECT:
        {
            // Copy each contiguous row span of the source rect into the transfer buffer.
            for (VkBufferCopy &copyRegion :
                 transferBufferHandleVk.rectCopyRegions(transferConfig.srcRect))
            {
                copyRegion.srcOffset += srcBuffer->getOffset();
                copyRegion.dstOffset += transferBufferHandleVk.getOffset();
                mComputePassCommands->getCommandBuffer().copyBuffer(
                    srcBuffer->getBuffer().getBuffer(),
                    transferBufferHandleVk.getBuffer().getBuffer(), 1, &copyRegion);
            }

            // Config transfer barrier
            // NOTE(review): same srcStageMask question as CL_COMMAND_READ_BUFFER above.
            srcStageMask             = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
            dstStageMask             = VK_PIPELINE_STAGE_HOST_BIT;
            memBarrier.srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT;
            memBarrier.dstAccessMask = VK_ACCESS_MEMORY_READ_BIT;
            break;
        }
        case CL_COMMAND_FILL_BUFFER:
        {
            VkBufferCopy copyRegion = {transferConfig.offset, transferConfig.offset,
                                       transferConfig.size};
            // Expand the fill pattern into the transfer buffer on the host, then copy the
            // filled range into srcBuffer on the device timeline.
            ANGLE_TRY(transferBufferHandleVk.fillWithPattern(
                transferConfig.srcHostPtr, transferConfig.patternSize, transferConfig.offset,
                transferConfig.size));
            copyRegion.srcOffset += transferBufferHandleVk.getOffset();
            copyRegion.dstOffset += srcBuffer->getOffset();
            mComputePassCommands->getCommandBuffer().copyBuffer(
                transferBufferHandleVk.getBuffer().getBuffer(), srcBuffer->getBuffer().getBuffer(),
                1, &copyRegion);

            // Config transfer barrier
            srcStageMask             = VK_PIPELINE_STAGE_TRANSFER_BIT;
            dstStageMask             = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
            memBarrier.srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT;
            memBarrier.dstAccessMask = VK_ACCESS_MEMORY_READ_BIT;
            break;
        }
        default:
            UNIMPLEMENTED();
            break;
    }

    // TODO(aannestrand): Look into combining these transfer barriers
    // http://anglebug.com/377545840
    mComputePassCommands->getCommandBuffer().pipelineBarrier(srcStageMask, dstStageMask, 0, 1,
                                                             &memBarrier, 0, nullptr, 0, nullptr);

    return angle::Result::Continue;
}
789 
addToHostTransferList(CLImageVk * srcImage,HostTransferConfig transferConfig)790 angle::Result CLCommandQueueVk::addToHostTransferList(CLImageVk *srcImage,
791                                                       HostTransferConfig transferConfig)
792 {
793     // TODO(aannestrand): Flush here if we reach some max-transfer-buffer heuristic
794     // http://anglebug.com/377545840
795     CommandsState &commandsState = mCommandsStateMap[mComputePassCommands->getQueueSerial()];
796 
797     cl::Memory *transferBufferHandle =
798         cl::Buffer::Cast(this->mContext->getFrontendObject().createBuffer(
799             nullptr, cl::MemFlags{CL_MEM_READ_WRITE}, srcImage->getSize(), nullptr));
800     if (transferBufferHandle == nullptr)
801     {
802         ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES);
803     }
804 
805     HostTransferEntry transferEntry{transferConfig, cl::MemoryPtr{transferBufferHandle}};
806     commandsState.hostTransferList.emplace_back(transferEntry);
807 
808     // Release initialization reference, lifetime controlled by RefPointer.
809     transferBufferHandle->release();
810 
811     // Enqueue blit
812     CLBufferVk &transferBufferHandleVk = transferBufferHandle->getImpl<CLBufferVk>();
813     ANGLE_TRY(copyImageToFromBuffer(*srcImage, transferBufferHandleVk.getBuffer(),
814                                     transferConfig.origin, transferConfig.region, 0,
815                                     ImageBufferCopyDirection::ToBuffer));
816 
817     return angle::Result::Continue;
818 }
819 
enqueueReadImage(const cl::Image & image,bool blocking,const cl::MemOffsets & origin,const cl::Coordinate & region,size_t rowPitch,size_t slicePitch,void * ptr,const cl::EventPtrs & waitEvents,CLEventImpl::CreateFunc * eventCreateFunc)820 angle::Result CLCommandQueueVk::enqueueReadImage(const cl::Image &image,
821                                                  bool blocking,
822                                                  const cl::MemOffsets &origin,
823                                                  const cl::Coordinate &region,
824                                                  size_t rowPitch,
825                                                  size_t slicePitch,
826                                                  void *ptr,
827                                                  const cl::EventPtrs &waitEvents,
828                                                  CLEventImpl::CreateFunc *eventCreateFunc)
829 {
830     std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
831     CLImageVk &imageVk = image.getImpl<CLImageVk>();
832     size_t size        = (region.x * region.y * region.z * imageVk.getElementSize());
833 
834     ANGLE_TRY(processWaitlist(waitEvents));
835 
836     if (imageVk.isStagingBufferInitialized() == false)
837     {
838         ANGLE_TRY(imageVk.createStagingBuffer(imageVk.getSize()));
839     }
840 
841     if (blocking)
842     {
843         ANGLE_TRY(copyImageToFromBuffer(imageVk, imageVk.getStagingBuffer(), origin, region, 0,
844                                         ImageBufferCopyDirection::ToBuffer));
845         ANGLE_TRY(finishInternal());
846         if (rowPitch == 0 && slicePitch == 0)
847         {
848             ANGLE_TRY(imageVk.copyStagingTo(ptr, 0, size));
849         }
850         else
851         {
852             ANGLE_TRY(imageVk.copyStagingToFromWithPitch(ptr, region, rowPitch, slicePitch,
853                                                          StagingBufferCopyDirection::ToHost));
854         }
855         ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Complete));
856     }
857     else
858     {
859         // Create a transfer buffer and push it in update list
860         HostTransferConfig transferConfig;
861         transferConfig.type        = CL_COMMAND_READ_IMAGE;
862         transferConfig.size        = size;
863         transferConfig.dstHostPtr  = ptr;
864         transferConfig.origin      = origin;
865         transferConfig.region      = region;
866         transferConfig.rowPitch    = rowPitch;
867         transferConfig.slicePitch  = slicePitch;
868         transferConfig.elementSize = imageVk.getElementSize();
869         ANGLE_TRY(addToHostTransferList(&imageVk, transferConfig));
870 
871         ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued));
872     }
873 
874     return angle::Result::Continue;
875 }
876 
enqueueWriteImage(const cl::Image & image,bool blocking,const cl::MemOffsets & origin,const cl::Coordinate & region,size_t inputRowPitch,size_t inputSlicePitch,const void * ptr,const cl::EventPtrs & waitEvents,CLEventImpl::CreateFunc * eventCreateFunc)877 angle::Result CLCommandQueueVk::enqueueWriteImage(const cl::Image &image,
878                                                   bool blocking,
879                                                   const cl::MemOffsets &origin,
880                                                   const cl::Coordinate &region,
881                                                   size_t inputRowPitch,
882                                                   size_t inputSlicePitch,
883                                                   const void *ptr,
884                                                   const cl::EventPtrs &waitEvents,
885                                                   CLEventImpl::CreateFunc *eventCreateFunc)
886 {
887     std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
888     ANGLE_TRY(processWaitlist(waitEvents));
889 
890     CLImageVk &imageVk = image.getImpl<CLImageVk>();
891     size_t size        = (region.x * region.y * region.z * imageVk.getElementSize());
892     cl::ExecutionStatus eventInitialState = cl::ExecutionStatus::Queued;
893     if (imageVk.isStagingBufferInitialized() == false)
894     {
895         ANGLE_TRY(imageVk.createStagingBuffer(imageVk.getSize()));
896     }
897 
898     if (inputRowPitch == 0 && inputSlicePitch == 0)
899     {
900         ANGLE_TRY(imageVk.copyStagingFrom((void *)ptr, 0, size));
901     }
902     else
903     {
904         ANGLE_TRY(imageVk.copyStagingToFromWithPitch((void *)ptr, region, inputRowPitch,
905                                                      inputSlicePitch,
906                                                      StagingBufferCopyDirection::ToStagingBuffer));
907     }
908 
909     ANGLE_TRY(copyImageToFromBuffer(imageVk, imageVk.getStagingBuffer(), origin, region, 0,
910                                     ImageBufferCopyDirection::ToImage));
911 
912     if (blocking)
913     {
914         ANGLE_TRY(finishInternal());
915         eventInitialState = cl::ExecutionStatus::Complete;
916     }
917 
918     ANGLE_TRY(createEvent(eventCreateFunc, eventInitialState));
919 
920     return angle::Result::Continue;
921 }
922 
enqueueCopyImage(const cl::Image & srcImage,const cl::Image & dstImage,const cl::MemOffsets & srcOrigin,const cl::MemOffsets & dstOrigin,const cl::Coordinate & region,const cl::EventPtrs & waitEvents,CLEventImpl::CreateFunc * eventCreateFunc)923 angle::Result CLCommandQueueVk::enqueueCopyImage(const cl::Image &srcImage,
924                                                  const cl::Image &dstImage,
925                                                  const cl::MemOffsets &srcOrigin,
926                                                  const cl::MemOffsets &dstOrigin,
927                                                  const cl::Coordinate &region,
928                                                  const cl::EventPtrs &waitEvents,
929                                                  CLEventImpl::CreateFunc *eventCreateFunc)
930 {
931     std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
932     ANGLE_TRY(processWaitlist(waitEvents));
933 
934     auto srcImageVk = &srcImage.getImpl<CLImageVk>();
935     auto dstImageVk = &dstImage.getImpl<CLImageVk>();
936 
937     vk::CommandBufferAccess access;
938     vk::OutsideRenderPassCommandBuffer *commandBuffer;
939     VkImageAspectFlags dstAspectFlags = srcImageVk->getImage().getAspectFlags();
940     VkImageAspectFlags srcAspectFlags = dstImageVk->getImage().getAspectFlags();
941     access.onImageTransferWrite(gl::LevelIndex(0), 1, 0, 1, dstAspectFlags,
942                                 &dstImageVk->getImage());
943     access.onImageTransferRead(srcAspectFlags, &srcImageVk->getImage());
944     ANGLE_TRY(getCommandBuffer(access, &commandBuffer));
945 
946     VkImageCopy copyRegion    = {};
947     copyRegion.extent         = cl_vk::GetExtent(srcImageVk->getExtentForCopy(region));
948     copyRegion.srcOffset      = cl_vk::GetOffset(srcImageVk->getOffsetForCopy(srcOrigin));
949     copyRegion.dstOffset      = cl_vk::GetOffset(dstImageVk->getOffsetForCopy(dstOrigin));
950     copyRegion.srcSubresource = srcImageVk->getSubresourceLayersForCopy(
951         srcOrigin, region, dstImageVk->getType(), ImageCopyWith::Image);
952     copyRegion.dstSubresource = dstImageVk->getSubresourceLayersForCopy(
953         dstOrigin, region, srcImageVk->getType(), ImageCopyWith::Image);
954     if (srcImageVk->isWritable() || dstImageVk->isWritable())
955     {
956         // We need an execution barrier if buffer can be written to by kernel
957         ANGLE_TRY(insertBarrier());
958     }
959 
960     commandBuffer->copyImage(
961         srcImageVk->getImage().getImage(), VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
962         dstImageVk->getImage().getImage(), VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &copyRegion);
963 
964     ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued));
965 
966     return angle::Result::Continue;
967 }
968 
enqueueFillImage(const cl::Image & image,const void * fillColor,const cl::MemOffsets & origin,const cl::Coordinate & region,const cl::EventPtrs & waitEvents,CLEventImpl::CreateFunc * eventCreateFunc)969 angle::Result CLCommandQueueVk::enqueueFillImage(const cl::Image &image,
970                                                  const void *fillColor,
971                                                  const cl::MemOffsets &origin,
972                                                  const cl::Coordinate &region,
973                                                  const cl::EventPtrs &waitEvents,
974                                                  CLEventImpl::CreateFunc *eventCreateFunc)
975 {
976     std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
977 
978     ANGLE_TRY(processWaitlist(waitEvents));
979 
980     CLImageVk &imageVk = image.getImpl<CLImageVk>();
981     PixelColor packedColor;
982     cl::Extents extent = imageVk.getImageExtent();
983 
984     imageVk.packPixels(fillColor, &packedColor);
985 
986     if (imageVk.isStagingBufferInitialized() == false)
987     {
988         ANGLE_TRY(imageVk.createStagingBuffer(imageVk.getSize()));
989     }
990 
991     ANGLE_TRY(copyImageToFromBuffer(imageVk, imageVk.getStagingBuffer(), cl::kMemOffsetsZero,
992                                     {extent.width, extent.height, extent.depth}, 0,
993                                     ImageBufferCopyDirection::ToBuffer));
994     ANGLE_TRY(finishInternal());
995 
996     uint8_t *mapPointer = nullptr;
997     ANGLE_TRY(imageVk.map(mapPointer, 0));
998     imageVk.fillImageWithColor(origin, region, mapPointer, &packedColor);
999     imageVk.unmap();
1000     mapPointer = nullptr;
1001     ANGLE_TRY(copyImageToFromBuffer(imageVk, imageVk.getStagingBuffer(), cl::kMemOffsetsZero,
1002                                     {extent.width, extent.height, extent.depth}, 0,
1003                                     ImageBufferCopyDirection::ToImage));
1004 
1005     ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued));
1006 
1007     return angle::Result::Continue;
1008 }
1009 
enqueueCopyImageToBuffer(const cl::Image & srcImage,const cl::Buffer & dstBuffer,const cl::MemOffsets & srcOrigin,const cl::Coordinate & region,size_t dstOffset,const cl::EventPtrs & waitEvents,CLEventImpl::CreateFunc * eventCreateFunc)1010 angle::Result CLCommandQueueVk::enqueueCopyImageToBuffer(const cl::Image &srcImage,
1011                                                          const cl::Buffer &dstBuffer,
1012                                                          const cl::MemOffsets &srcOrigin,
1013                                                          const cl::Coordinate &region,
1014                                                          size_t dstOffset,
1015                                                          const cl::EventPtrs &waitEvents,
1016                                                          CLEventImpl::CreateFunc *eventCreateFunc)
1017 {
1018     std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
1019     CLImageVk &srcImageVk   = srcImage.getImpl<CLImageVk>();
1020     CLBufferVk &dstBufferVk = dstBuffer.getImpl<CLBufferVk>();
1021 
1022     ANGLE_TRY(processWaitlist(waitEvents));
1023 
1024     ANGLE_TRY(copyImageToFromBuffer(srcImageVk, dstBufferVk.getBuffer(), srcOrigin, region,
1025                                     dstOffset, ImageBufferCopyDirection::ToBuffer));
1026 
1027     ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued));
1028 
1029     return angle::Result::Continue;
1030 }
1031 
enqueueCopyBufferToImage(const cl::Buffer & srcBuffer,const cl::Image & dstImage,size_t srcOffset,const cl::MemOffsets & dstOrigin,const cl::Coordinate & region,const cl::EventPtrs & waitEvents,CLEventImpl::CreateFunc * eventCreateFunc)1032 angle::Result CLCommandQueueVk::enqueueCopyBufferToImage(const cl::Buffer &srcBuffer,
1033                                                          const cl::Image &dstImage,
1034                                                          size_t srcOffset,
1035                                                          const cl::MemOffsets &dstOrigin,
1036                                                          const cl::Coordinate &region,
1037                                                          const cl::EventPtrs &waitEvents,
1038                                                          CLEventImpl::CreateFunc *eventCreateFunc)
1039 {
1040     std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
1041     CLBufferVk &srcBufferVk = srcBuffer.getImpl<CLBufferVk>();
1042     CLImageVk &dstImageVk   = dstImage.getImpl<CLImageVk>();
1043 
1044     ANGLE_TRY(processWaitlist(waitEvents));
1045 
1046     ANGLE_TRY(copyImageToFromBuffer(dstImageVk, srcBufferVk.getBuffer(), dstOrigin, region,
1047                                     srcOffset, ImageBufferCopyDirection::ToImage));
1048 
1049     ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued));
1050 
1051     return angle::Result::Continue;
1052 }
1053 
// Maps an image region for host access. The image is snapshotted into its staging buffer
// and the returned pointer aliases either the user's host pointer (CL_MEM_USE_HOST_PTR)
// or the mapped staging buffer. Row/slice pitches of the mapped data are returned through
// the out-parameters.
//
// NOTE(review): imageRowPitch is dereferenced unconditionally, and imageSlicePitch is
// dereferenced for 1D-array/2D-array/3D images — assumes the frontend validated them as
// non-null per the clEnqueueMapImage spec; confirm against the validation layer.
angle::Result CLCommandQueueVk::enqueueMapImage(const cl::Image &image,
                                                bool blocking,
                                                cl::MapFlags mapFlags,
                                                const cl::MemOffsets &origin,
                                                const cl::Coordinate &region,
                                                size_t *imageRowPitch,
                                                size_t *imageSlicePitch,
                                                const cl::EventPtrs &waitEvents,
                                                CLEventImpl::CreateFunc *eventCreateFunc,
                                                void *&mapPtr)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);

    ANGLE_TRY(processWaitlist(waitEvents));

    // TODO: Look into better enqueue handling of this map-op if non-blocking
    // https://anglebug.com/376722715
    CLImageVk *imageVk = &image.getImpl<CLImageVk>();
    cl::Extents extent = imageVk->getImageExtent();
    if (blocking)
    {
        ANGLE_TRY(finishInternal());
    }

    mComputePassCommands->imageRead(mContext, imageVk->getImage().getAspectFlags(),
                                    vk::ImageLayout::TransferSrc, &imageVk->getImage());

    // Lazily allocate the staging buffer that backs the mapping.
    if (imageVk->isStagingBufferInitialized() == false)
    {
        ANGLE_TRY(imageVk->createStagingBuffer(imageVk->getSize()));
    }

    // Snapshot the whole image into staging; a second finish (blocking only) guarantees
    // the staging contents are valid before the host reads them.
    ANGLE_TRY(copyImageToFromBuffer(*imageVk, imageVk->getStagingBuffer(), cl::kMemOffsetsZero,
                                    {extent.width, extent.height, extent.depth}, 0,
                                    ImageBufferCopyDirection::ToBuffer));
    if (blocking)
    {
        ANGLE_TRY(finishInternal());
    }

    // Linear offset of the origin texel, assuming tightly packed rows/slices in staging.
    uint8_t *mapPointer = nullptr;
    size_t elementSize  = imageVk->getElementSize();
    size_t rowPitch     = (extent.width * elementSize);
    size_t offset =
        (origin.x * elementSize) + (origin.y * rowPitch) + (origin.z * extent.height * rowPitch);
    size_t size = (region.x * region.y * region.z * elementSize);

    if (image.getFlags().intersects(CL_MEM_USE_HOST_PTR))
    {
        // USE_HOST_PTR: the user's own pointer is returned and refreshed from the image.
        mapPointer = static_cast<uint8_t *>(image.getHostPtr()) + offset;
        ANGLE_TRY(imageVk->copyTo(mapPointer, offset, size));
    }
    else
    {
        // Otherwise map the staging buffer directly at the computed offset.
        ANGLE_TRY(imageVk->map(mapPointer, offset));
    }
    mapPtr = static_cast<void *>(mapPointer);

    *imageRowPitch = rowPitch;

    // Slice pitch depends on image dimensionality: 0 for 1D/2D, a full 2D slice for
    // 2D-array/3D, and one row for 1D-array images.
    switch (imageVk->getDescriptor().type)
    {
        case cl::MemObjectType::Image1D:
        case cl::MemObjectType::Image1D_Buffer:
        case cl::MemObjectType::Image2D:
            if (imageSlicePitch != nullptr)
            {
                *imageSlicePitch = 0;
            }
            break;
        case cl::MemObjectType::Image2D_Array:
        case cl::MemObjectType::Image3D:
            *imageSlicePitch = (extent.height * (*imageRowPitch));
            break;
        case cl::MemObjectType::Image1D_Array:
            *imageSlicePitch = *imageRowPitch;
            break;
        default:
            UNREACHABLE();
            break;
    }

    ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Complete));

    return angle::Result::Continue;
}
1140 
// Unmaps a previously mapped buffer or image, flushing any host-visible modifications
// back to the device copy. For CL_MEM_USE_HOST_PTR objects the user's host pointer is the
// source of truth and is copied back; images additionally re-upload their staging buffer.
//
// The event completes immediately whenever an internal finish was required; otherwise it
// is left in the Queued state.
angle::Result CLCommandQueueVk::enqueueUnmapMemObject(const cl::Memory &memory,
                                                      void *mappedPtr,
                                                      const cl::EventPtrs &waitEvents,
                                                      CLEventImpl::CreateFunc *eventCreateFunc)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);

    ANGLE_TRY(processWaitlist(waitEvents));

    cl::ExecutionStatus eventComplete = cl::ExecutionStatus::Queued;
    if (!eventCreateFunc)
    {
        // No event requested: nothing can observe "queued", so drain the queue and treat
        // the unmap as complete.
        ANGLE_TRY(finishInternal());
        eventComplete = cl::ExecutionStatus::Complete;
    }

    if (memory.getType() == cl::MemObjectType::Buffer)
    {
        CLBufferVk &bufferVk = memory.getImpl<CLBufferVk>();
        if (memory.getFlags().intersects(CL_MEM_USE_HOST_PTR))
        {
            // Host pointer owns the data: wait for pending work, then copy the full host
            // contents back into the device buffer.
            ANGLE_TRY(finishInternal());
            ANGLE_TRY(bufferVk.copyFrom(memory.getHostPtr(), 0, bufferVk.getSize()));
            eventComplete = cl::ExecutionStatus::Complete;
        }
    }
    else if (memory.getType() != cl::MemObjectType::Pipe)
    {
        // of image type
        CLImageVk &imageVk = memory.getImpl<CLImageVk>();
        if (memory.getFlags().intersects(CL_MEM_USE_HOST_PTR))
        {
            // Refresh the staging buffer from the user's host pointer first.
            uint8_t *mapPointer = static_cast<uint8_t *>(memory.getHostPtr());
            ANGLE_TRY(imageVk.copyStagingFrom(mapPointer, 0, imageVk.getSize()));
        }
        // Re-upload the (possibly modified) staging contents into the image, then wait so
        // the unmap is observable as complete.
        cl::Extents extent = imageVk.getImageExtent();
        ANGLE_TRY(copyImageToFromBuffer(imageVk, imageVk.getStagingBuffer(), cl::kMemOffsetsZero,
                                        {extent.width, extent.height, extent.depth}, 0,
                                        ImageBufferCopyDirection::ToImage));
        ANGLE_TRY(finishInternal());
        eventComplete = cl::ExecutionStatus::Complete;
    }
    else
    {
        // mem object type pipe is not supported and creation of such an object should have
        // failed
        UNREACHABLE();
    }

    memory.getImpl<CLMemoryVk>().unmap();
    ANGLE_TRY(createEvent(eventCreateFunc, eventComplete));

    return angle::Result::Continue;
}
1195 
enqueueMigrateMemObjects(const cl::MemoryPtrs & memObjects,cl::MemMigrationFlags flags,const cl::EventPtrs & waitEvents,CLEventImpl::CreateFunc * eventCreateFunc)1196 angle::Result CLCommandQueueVk::enqueueMigrateMemObjects(const cl::MemoryPtrs &memObjects,
1197                                                          cl::MemMigrationFlags flags,
1198                                                          const cl::EventPtrs &waitEvents,
1199                                                          CLEventImpl::CreateFunc *eventCreateFunc)
1200 {
1201     std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
1202 
1203     ANGLE_TRY(processWaitlist(waitEvents));
1204 
1205     if (mCommandQueue.getContext().getDevices().size() > 1)
1206     {
1207         // TODO(aannestrand): Later implement support to allow migration of mem objects across
1208         // different devices. http://anglebug.com/377942759
1209         UNIMPLEMENTED();
1210         ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES);
1211     }
1212 
1213     ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Complete));
1214 
1215     return angle::Result::Continue;
1216 }
1217 
enqueueNDRangeKernel(const cl::Kernel & kernel,const cl::NDRange & ndrange,const cl::EventPtrs & waitEvents,CLEventImpl::CreateFunc * eventCreateFunc)1218 angle::Result CLCommandQueueVk::enqueueNDRangeKernel(const cl::Kernel &kernel,
1219                                                      const cl::NDRange &ndrange,
1220                                                      const cl::EventPtrs &waitEvents,
1221                                                      CLEventImpl::CreateFunc *eventCreateFunc)
1222 {
1223     std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
1224 
1225     ANGLE_TRY(processWaitlist(waitEvents));
1226 
1227     vk::PipelineCacheAccess pipelineCache;
1228     vk::PipelineHelper *pipelineHelper = nullptr;
1229     CLKernelVk &kernelImpl             = kernel.getImpl<CLKernelVk>();
1230     const CLProgramVk::DeviceProgramData *devProgramData =
1231         kernelImpl.getProgram()->getDeviceProgramData(mCommandQueue.getDevice().getNative());
1232     ASSERT(devProgramData != nullptr);
1233     cl::NDRange enqueueNDRange(ndrange);
1234 
1235     // Start with Workgroup size (WGS) from kernel attribute (if available)
1236     cl::WorkgroupSize workgroupSize =
1237         devProgramData->getCompiledWorkgroupSize(kernelImpl.getKernelName());
1238     if (workgroupSize != cl::WorkgroupSize{0, 0, 0})
1239     {
1240         // Local work size (LWS) was valid, use that as WGS
1241         enqueueNDRange.localWorkSize = workgroupSize;
1242     }
1243     else
1244     {
1245         if (enqueueNDRange.nullLocalWorkSize)
1246         {
1247             // NULL value was passed, in which case the OpenCL implementation will determine
1248             // how to be break the global work-items into appropriate work-group instances.
1249             enqueueNDRange.localWorkSize =
1250                 mCommandQueue.getDevice().getImpl<CLDeviceVk>().selectWorkGroupSize(enqueueNDRange);
1251         }
1252         // At this point, we should have a non-zero Workgroup size
1253         ASSERT((enqueueNDRange.localWorkSize != cl::WorkgroupSize{0, 0, 0}));
1254     }
1255 
1256     // Printf storage is setup for single time usage. So drive any existing usage to completion if
1257     // the kernel uses printf.
1258     if (kernelImpl.usesPrintf() && mNeedPrintfHandling)
1259     {
1260         ANGLE_TRY(finishInternal());
1261     }
1262 
1263     // Fetch or create compute pipeline (if we miss in cache)
1264     ANGLE_CL_IMPL_TRY_ERROR(mContext->getRenderer()->getPipelineCache(mContext, &pipelineCache),
1265                             CL_OUT_OF_RESOURCES);
1266 
1267     ANGLE_TRY(processKernelResources(kernelImpl));
1268     ANGLE_TRY(processGlobalPushConstants(kernelImpl, enqueueNDRange));
1269 
1270     // Create uniform dispatch region(s) based on VkLimits for WorkgroupCount
1271     const uint32_t *maxComputeWorkGroupCount =
1272         mContext->getRenderer()->getPhysicalDeviceProperties().limits.maxComputeWorkGroupCount;
1273     for (cl::NDRange &uniformRegion : enqueueNDRange.createUniformRegions(
1274              {maxComputeWorkGroupCount[0], maxComputeWorkGroupCount[1],
1275               maxComputeWorkGroupCount[2]}))
1276     {
1277         cl::WorkgroupCount uniformRegionWorkgroupCount = uniformRegion.getWorkgroupCount();
1278         const VkPushConstantRange *pushConstantRegionOffset =
1279             devProgramData->getRegionOffsetRange();
1280         if (pushConstantRegionOffset != nullptr)
1281         {
1282             // The sum of the global ID offset into the NDRange for this uniform region and
1283             // the global offset of the NDRange
1284             // https://github.com/google/clspv/blob/main/docs/OpenCLCOnVulkan.md#module-scope-push-constants
1285             uint32_t regionOffsets[3] = {
1286                 enqueueNDRange.globalWorkOffset[0] + uniformRegion.globalWorkOffset[0],
1287                 enqueueNDRange.globalWorkOffset[1] + uniformRegion.globalWorkOffset[1],
1288                 enqueueNDRange.globalWorkOffset[2] + uniformRegion.globalWorkOffset[2]};
1289             mComputePassCommands->getCommandBuffer().pushConstants(
1290                 kernelImpl.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT,
1291                 pushConstantRegionOffset->offset, pushConstantRegionOffset->size, &regionOffsets);
1292         }
1293         const VkPushConstantRange *pushConstantRegionGroupOffset =
1294             devProgramData->getRegionGroupOffsetRange();
1295         if (pushConstantRegionGroupOffset != nullptr)
1296         {
1297             // The 3D group ID offset into the NDRange for this region
1298             // https://github.com/google/clspv/blob/main/docs/OpenCLCOnVulkan.md#module-scope-push-constants
1299             ASSERT(enqueueNDRange.localWorkSize[0] > 0 && enqueueNDRange.localWorkSize[1] > 0 &&
1300                    enqueueNDRange.localWorkSize[2] > 0);
1301             ASSERT(uniformRegion.globalWorkOffset[0] % enqueueNDRange.localWorkSize[0] == 0 &&
1302                    uniformRegion.globalWorkOffset[1] % enqueueNDRange.localWorkSize[1] == 0 &&
1303                    uniformRegion.globalWorkOffset[2] % enqueueNDRange.localWorkSize[2] == 0);
1304             uint32_t regionGroupOffsets[3] = {
1305                 uniformRegion.globalWorkOffset[0] / enqueueNDRange.localWorkSize[0],
1306                 uniformRegion.globalWorkOffset[1] / enqueueNDRange.localWorkSize[1],
1307                 uniformRegion.globalWorkOffset[2] / enqueueNDRange.localWorkSize[2]};
1308             mComputePassCommands->getCommandBuffer().pushConstants(
1309                 kernelImpl.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT,
1310                 pushConstantRegionGroupOffset->offset, pushConstantRegionGroupOffset->size,
1311                 &regionGroupOffsets);
1312         }
1313 
1314         ANGLE_TRY(kernelImpl.getOrCreateComputePipeline(
1315             &pipelineCache, uniformRegion, mCommandQueue.getDevice(), &pipelineHelper));
1316         mComputePassCommands->retainResource(pipelineHelper);
1317         mComputePassCommands->getCommandBuffer().bindComputePipeline(pipelineHelper->getPipeline());
1318         mComputePassCommands->getCommandBuffer().dispatch(uniformRegionWorkgroupCount[0],
1319                                                           uniformRegionWorkgroupCount[1],
1320                                                           uniformRegionWorkgroupCount[2]);
1321     }
1322 
1323     ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued));
1324 
1325     return angle::Result::Continue;
1326 }
1327 
enqueueTask(const cl::Kernel & kernel,const cl::EventPtrs & waitEvents,CLEventImpl::CreateFunc * eventCreateFunc)1328 angle::Result CLCommandQueueVk::enqueueTask(const cl::Kernel &kernel,
1329                                             const cl::EventPtrs &waitEvents,
1330                                             CLEventImpl::CreateFunc *eventCreateFunc)
1331 {
1332     constexpr size_t globalWorkSize[3] = {1, 0, 0};
1333     constexpr size_t localWorkSize[3]  = {1, 0, 0};
1334     cl::NDRange ndrange(1, nullptr, globalWorkSize, localWorkSize);
1335     return enqueueNDRangeKernel(kernel, ndrange, waitEvents, eventCreateFunc);
1336 }
1337 
// Native (host-function) kernels are not supported by the Vulkan back end;
// the entry point exists only to satisfy the CLCommandQueueImpl interface.
// Always reports CL_OUT_OF_RESOURCES after flagging the unimplemented path.
angle::Result CLCommandQueueVk::enqueueNativeKernel(cl::UserFunc userFunc,
                                                    void *args,
                                                    size_t cbArgs,
                                                    const cl::BufferPtrs &buffers,
                                                    const std::vector<size_t> &bufferPtrOffsets,
                                                    const cl::EventPtrs &waitEvents,
                                                    CLEventImpl::CreateFunc *eventCreateFunc)
{
    UNIMPLEMENTED();
    ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES);
}
1349 
enqueueMarkerWithWaitList(const cl::EventPtrs & waitEvents,CLEventImpl::CreateFunc * eventCreateFunc)1350 angle::Result CLCommandQueueVk::enqueueMarkerWithWaitList(const cl::EventPtrs &waitEvents,
1351                                                           CLEventImpl::CreateFunc *eventCreateFunc)
1352 {
1353     std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
1354 
1355     ANGLE_TRY(processWaitlist(waitEvents));
1356     ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued));
1357 
1358     return angle::Result::Continue;
1359 }
1360 
enqueueMarker(CLEventImpl::CreateFunc & eventCreateFunc)1361 angle::Result CLCommandQueueVk::enqueueMarker(CLEventImpl::CreateFunc &eventCreateFunc)
1362 {
1363     std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
1364 
1365     // This deprecated API is essentially a super-set of clEnqueueBarrier, where we also return
1366     // an event object (i.e. marker) since clEnqueueBarrier does not provide this
1367     ANGLE_TRY(insertBarrier());
1368 
1369     ANGLE_TRY(createEvent(&eventCreateFunc, cl::ExecutionStatus::Queued));
1370 
1371     return angle::Result::Continue;
1372 }
1373 
enqueueWaitForEvents(const cl::EventPtrs & events)1374 angle::Result CLCommandQueueVk::enqueueWaitForEvents(const cl::EventPtrs &events)
1375 {
1376     std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
1377 
1378     // Unlike clWaitForEvents, this routine is non-blocking
1379     ANGLE_TRY(processWaitlist(events));
1380 
1381     return angle::Result::Continue;
1382 }
1383 
enqueueBarrierWithWaitList(const cl::EventPtrs & waitEvents,CLEventImpl::CreateFunc * eventCreateFunc)1384 angle::Result CLCommandQueueVk::enqueueBarrierWithWaitList(const cl::EventPtrs &waitEvents,
1385                                                            CLEventImpl::CreateFunc *eventCreateFunc)
1386 {
1387     std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
1388 
1389     // The barrier command either waits for a list of events to complete, or if the list is
1390     // empty it waits for all commands previously enqueued in command_queue to complete before
1391     // it completes
1392     if (waitEvents.empty())
1393     {
1394         ANGLE_TRY(insertBarrier());
1395     }
1396     else
1397     {
1398         ANGLE_TRY(processWaitlist(waitEvents));
1399     }
1400 
1401     ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued));
1402 
1403     return angle::Result::Continue;
1404 }
1405 
insertBarrier()1406 angle::Result CLCommandQueueVk::insertBarrier()
1407 {
1408     VkMemoryBarrier memoryBarrier = {VK_STRUCTURE_TYPE_MEMORY_BARRIER, nullptr,
1409                                      VK_ACCESS_SHADER_WRITE_BIT,
1410                                      VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT};
1411     mComputePassCommands->getCommandBuffer().pipelineBarrier(
1412         VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1,
1413         &memoryBarrier, 0, nullptr, 0, nullptr);
1414 
1415     return angle::Result::Continue;
1416 }
1417 
enqueueBarrier()1418 angle::Result CLCommandQueueVk::enqueueBarrier()
1419 {
1420     std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
1421 
1422     ANGLE_TRY(insertBarrier());
1423 
1424     return angle::Result::Continue;
1425 }
1426 
flush()1427 angle::Result CLCommandQueueVk::flush()
1428 {
1429     ANGLE_TRACE_EVENT0("gpu.angle", "CLCommandQueueVk::flush");
1430 
1431     QueueSerial lastSubmittedQueueSerial;
1432     {
1433         std::unique_lock<std::mutex> ul(mCommandQueueMutex);
1434 
1435         ANGLE_TRY(flushInternal());
1436         lastSubmittedQueueSerial = mLastSubmittedQueueSerial;
1437     }
1438 
1439     return mFinishHandler.notify(lastSubmittedQueueSerial);
1440 }
1441 
finish()1442 angle::Result CLCommandQueueVk::finish()
1443 {
1444     std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
1445 
1446     ANGLE_TRACE_EVENT0("gpu.angle", "CLCommandQueueVk::finish");
1447 
1448     // Blocking finish
1449     return finishInternal();
1450 }
1451 
syncHostBuffers(HostTransferEntries & hostTransferList)1452 angle::Result CLCommandQueueVk::syncHostBuffers(HostTransferEntries &hostTransferList)
1453 {
1454     if (!hostTransferList.empty())
1455     {
1456         for (const HostTransferEntry &hostTransferEntry : hostTransferList)
1457         {
1458             const HostTransferConfig &transferConfig = hostTransferEntry.transferConfig;
1459             CLBufferVk &transferBufferVk =
1460                 hostTransferEntry.transferBufferHandle->getImpl<CLBufferVk>();
1461             switch (hostTransferEntry.transferConfig.type)
1462             {
1463                 case CL_COMMAND_FILL_BUFFER:
1464                 case CL_COMMAND_WRITE_BUFFER:
1465                 case CL_COMMAND_WRITE_BUFFER_RECT:
1466                     // Nothing left to do here
1467                     break;
1468                 case CL_COMMAND_READ_BUFFER:
1469                 case CL_COMMAND_READ_IMAGE:
1470                     if (transferConfig.rowPitch == 0 && transferConfig.slicePitch == 0)
1471                     {
1472                         ANGLE_TRY(transferBufferVk.copyTo(
1473                             transferConfig.dstHostPtr, transferConfig.offset, transferConfig.size));
1474                     }
1475                     else
1476                     {
1477                         ANGLE_TRY(transferBufferVk.copyToWithPitch(
1478                             transferConfig.dstHostPtr, transferConfig.offset, transferConfig.size,
1479                             transferConfig.rowPitch, transferConfig.slicePitch,
1480                             transferConfig.region, transferConfig.elementSize));
1481                     }
1482                     break;
1483                 case CL_COMMAND_READ_BUFFER_RECT:
1484                     ANGLE_TRY(transferBufferVk.getRect(
1485                         transferConfig.srcRect, transferConfig.dstRect, transferConfig.dstHostPtr));
1486                     break;
1487                 default:
1488                     UNIMPLEMENTED();
1489                     break;
1490             }
1491         }
1492     }
1493     hostTransferList.clear();
1494 
1495     return angle::Result::Continue;
1496 }
1497 
// Records |clMem| as used by the command batch currently being recorded and
// inserts any barrier/layout transition required before a kernel may access
// it.  For sub-buffers/views, hazards are also tracked against the parent
// memory object so overlapping accesses through either handle are caught.
angle::Result CLCommandQueueVk::addMemoryDependencies(cl::Memory *clMem)
{
    cl::Memory *parentMem = clMem->getParent() ? clMem->getParent().get() : nullptr;

    // Take an usage count: retain the memory object against the current queue
    // serial so it stays alive until this batch finishes.
    mCommandsStateMap[mComputePassCommands->getQueueSerial()].memories.emplace_back(clMem);

    // Handle possible resource RAW hazard: if this memory (or its parent) was
    // already used in the current batch, or the tracker is full, conservatively
    // schedule a full barrier and restart tracking.
    bool needsBarrier = false;
    if (clMem->getFlags().intersects(CL_MEM_READ_WRITE))
    {
        // Texel buffers have backing buffer objects
        if (mDependencyTracker.contains(clMem) || mDependencyTracker.contains(parentMem) ||
            mDependencyTracker.size() == kMaxDependencyTrackerSize)
        {
            needsBarrier = true;
            mDependencyTracker.clear();
        }
        mDependencyTracker.insert(clMem);
        if (parentMem)
        {
            mDependencyTracker.insert(parentMem);
        }
    }

    // Insert a layout transition for images
    // NOTE(review): images are always transitioned to ComputeShaderWrite here,
    // even if the kernel only reads them -- confirm this is intentional.
    if (cl::IsImageType(clMem->getType()))
    {
        CLImageVk &vkMem = clMem->getImpl<CLImageVk>();
        mComputePassCommands->imageWrite(mContext, gl::LevelIndex(0), 0, 1,
                                         vkMem.getImage().getAspectFlags(),
                                         vk::ImageLayout::ComputeShaderWrite, &vkMem.getImage());
    }
    if (needsBarrier)
    {
        ANGLE_TRY(insertBarrier());
    }

    return angle::Result::Continue;
}
1538 
// Prepares every Vulkan resource a kernel dispatch needs: descriptor set
// layouts and sets, the pipeline layout, literal samplers, per-argument
// descriptors and argument-related push constants, and (when the kernel uses
// printf) the printf storage buffer.  Finally flushes the accumulated
// descriptor writes and binds the descriptor sets on the compute command
// buffer.  Must be called while recording a compute pass for |kernelVk|.
angle::Result CLCommandQueueVk::processKernelResources(CLKernelVk &kernelVk)
{
    // POD arguments may be aggregated into a single uniform/storage buffer;
    // its binding and descriptor type are discovered while walking the
    // argument list below and the descriptor write is emitted afterwards.
    bool podBufferPresent              = false;
    uint32_t podBinding                = 0;
    VkDescriptorType podDescriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER;
    const CLProgramVk::DeviceProgramData *devProgramData =
        kernelVk.getProgram()->getDeviceProgramData(mCommandQueue.getDevice().getNative());
    ASSERT(devProgramData != nullptr);

    // Set the descriptor set layouts and allocate descriptor sets
    // The descriptor set layouts are setup in the order of their appearance, as Vulkan requires
    // them to point to valid handles.
    angle::EnumIterator<DescriptorSetIndex> layoutIndex(DescriptorSetIndex::LiteralSampler);
    for (DescriptorSetIndex index : angle::AllEnums<DescriptorSetIndex>())
    {
        if (!kernelVk.getDescriptorSetLayoutDesc(index).empty())
        {
            // Setup the descriptor layout
            ANGLE_CL_IMPL_TRY_ERROR(mContext->getDescriptorSetLayoutCache()->getDescriptorSetLayout(
                                        mContext, kernelVk.getDescriptorSetLayoutDesc(index),
                                        &kernelVk.getDescriptorSetLayouts()[*layoutIndex]),
                                    CL_INVALID_OPERATION);
            ASSERT(kernelVk.getDescriptorSetLayouts()[*layoutIndex]->valid());

            // Allocate descriptor set
            ANGLE_TRY(mContext->allocateDescriptorSet(&kernelVk, index, layoutIndex,
                                                      mComputePassCommands));
            ++layoutIndex;
        }
    }

    // Setup the pipeline layout
    ANGLE_CL_IMPL_TRY_ERROR(kernelVk.initPipelineLayout(), CL_INVALID_OPERATION);

    // Retain kernel object until we finish executing it later
    mCommandsStateMap[mComputePassCommands->getQueueSerial()].kernels.emplace_back(
        &kernelVk.getFrontendObject());

    // Process descriptor sets used by the kernel
    vk::DescriptorSetArray<UpdateDescriptorSetsBuilder> updateDescriptorSetsBuilders;

    UpdateDescriptorSetsBuilder &literalSamplerDescSetBuilder =
        updateDescriptorSetsBuilders[DescriptorSetIndex::LiteralSampler];

    // Create/Setup Literal Sampler
    // One sampler object is created per literal sampler found by clspv
    // reflection; each gets a SAMPLER descriptor at its reflected binding.
    for (const ClspvLiteralSampler &literalSampler : devProgramData->reflectionData.literalSamplers)
    {
        cl::SamplerPtr clLiteralSampler =
            cl::SamplerPtr(cl::Sampler::Cast(this->mContext->getFrontendObject().createSampler(
                literalSampler.normalizedCoords, literalSampler.addressingMode,
                literalSampler.filterMode)));

        // Release immediately to ensure correct refcount
        clLiteralSampler->release();
        ASSERT(clLiteralSampler != nullptr);
        CLSamplerVk &vkLiteralSampler = clLiteralSampler->getImpl<CLSamplerVk>();

        VkDescriptorImageInfo &samplerInfo =
            literalSamplerDescSetBuilder.allocDescriptorImageInfo();
        samplerInfo.sampler     = vkLiteralSampler.getSamplerHelper().get().getHandle();
        samplerInfo.imageView   = VK_NULL_HANDLE;
        samplerInfo.imageLayout = VK_IMAGE_LAYOUT_UNDEFINED;

        VkWriteDescriptorSet &writeDescriptorSet =
            literalSamplerDescSetBuilder.allocWriteDescriptorSet();
        writeDescriptorSet.descriptorCount = 1;
        writeDescriptorSet.descriptorType  = VK_DESCRIPTOR_TYPE_SAMPLER;
        writeDescriptorSet.pImageInfo      = &samplerInfo;
        writeDescriptorSet.sType           = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
        writeDescriptorSet.dstSet = kernelVk.getDescriptorSet(DescriptorSetIndex::LiteralSampler);
        writeDescriptorSet.dstBinding = literalSampler.binding;

        // Retain the sampler against the current queue serial so it survives
        // until the recorded commands complete.
        mCommandsStateMap[mComputePassCommands->getQueueSerial()].samplers.emplace_back(
            clLiteralSampler);
    }

    // Walk the kernel arguments and emit a descriptor write and/or push
    // constant for each, depending on its reflected argument kind.
    CLKernelArguments args = kernelVk.getArgs();
    UpdateDescriptorSetsBuilder &kernelArgDescSetBuilder =
        updateDescriptorSetsBuilders[DescriptorSetIndex::KernelArguments];
    for (size_t index = 0; index < args.size(); index++)
    {
        const auto &arg = args.at(index);
        switch (arg.type)
        {
            case NonSemanticClspvReflectionArgumentUniform:
            case NonSemanticClspvReflectionArgumentStorageBuffer:
            {
                cl::Memory *clMem = cl::Buffer::Cast(static_cast<const cl_mem>(arg.handle));
                CLBufferVk &vkMem = clMem->getImpl<CLBufferVk>();

                ANGLE_TRY(addMemoryDependencies(clMem));

                // Update buffer/descriptor info
                VkDescriptorBufferInfo &bufferInfo =
                    kernelArgDescSetBuilder.allocDescriptorBufferInfo();
                bufferInfo.range  = clMem->getSize();
                bufferInfo.offset = clMem->getOffset();
                bufferInfo.buffer = vkMem.getBuffer().getBuffer().getHandle();
                VkWriteDescriptorSet &writeDescriptorSet =
                    kernelArgDescSetBuilder.allocWriteDescriptorSet();
                writeDescriptorSet.descriptorCount = 1;
                writeDescriptorSet.descriptorType =
                    arg.type == NonSemanticClspvReflectionArgumentUniform
                        ? VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER
                        : VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
                writeDescriptorSet.pBufferInfo = &bufferInfo;
                writeDescriptorSet.sType       = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
                writeDescriptorSet.dstSet =
                    kernelVk.getDescriptorSet(DescriptorSetIndex::KernelArguments);
                writeDescriptorSet.dstBinding = arg.descriptorBinding;
                break;
            }
            case NonSemanticClspvReflectionArgumentPodPushConstant:
            {
                ASSERT(!podBufferPresent);

                // Spec requires the size and offset to be multiple of 4, round up for size and
                // round down for offset to ensure this
                uint32_t offset = roundDownPow2(arg.pushConstOffset, 4u);
                uint32_t size =
                    roundUpPow2(arg.pushConstOffset + arg.pushConstantSize, 4u) - offset;
                ASSERT(offset + size <= kernelVk.getPodArgumentPushConstantsData().size());
                mComputePassCommands->getCommandBuffer().pushConstants(
                    kernelVk.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT, offset, size,
                    &kernelVk.getPodArgumentPushConstantsData()[offset]);
                break;
            }
            case NonSemanticClspvReflectionArgumentWorkgroup:
            {
                // Nothing to do here (this is already taken care of during clSetKernelArg)
                break;
            }
            case NonSemanticClspvReflectionArgumentSampler:
            {
                cl::Sampler *clSampler =
                    cl::Sampler::Cast(*static_cast<const cl_sampler *>(arg.handle));
                CLSamplerVk &vkSampler = clSampler->getImpl<CLSamplerVk>();
                VkDescriptorImageInfo &samplerInfo =
                    kernelArgDescSetBuilder.allocDescriptorImageInfo();
                samplerInfo.sampler = vkSampler.getSamplerHelper().get().getHandle();
                VkWriteDescriptorSet &writeDescriptorSet =
                    kernelArgDescSetBuilder.allocWriteDescriptorSet();
                writeDescriptorSet.descriptorCount = 1;
                writeDescriptorSet.descriptorType  = VK_DESCRIPTOR_TYPE_SAMPLER;
                writeDescriptorSet.pImageInfo      = &samplerInfo;
                writeDescriptorSet.sType           = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
                writeDescriptorSet.dstSet =
                    kernelVk.getDescriptorSet(DescriptorSetIndex::KernelArguments);
                writeDescriptorSet.dstBinding = arg.descriptorBinding;

                // If reflection exposed a normalized-coords mask for this
                // sampler argument, push it; unnormalized samplers get a
                // normalized clone whose handle replaces the original here.
                const VkPushConstantRange *samplerMaskRange =
                    devProgramData->getNormalizedSamplerMaskRange(index);
                if (samplerMaskRange != nullptr)
                {
                    if (clSampler->getNormalizedCoords() == false)
                    {
                        ANGLE_TRY(vkSampler.createNormalized());
                        samplerInfo.sampler =
                            vkSampler.getSamplerHelperNormalized().get().getHandle();
                    }
                    uint32_t mask = vkSampler.getSamplerMask();
                    mComputePassCommands->getCommandBuffer().pushConstants(
                        kernelVk.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT,
                        samplerMaskRange->offset, samplerMaskRange->size, &mask);
                }
                break;
            }
            case NonSemanticClspvReflectionArgumentStorageImage:
            case NonSemanticClspvReflectionArgumentSampledImage:
            {
                cl::Memory *clMem = cl::Image::Cast(static_cast<const cl_mem>(arg.handle));
                CLImageVk &vkMem  = clMem->getImpl<CLImageVk>();

                ANGLE_TRY(addMemoryDependencies(clMem));

                // Push the image's CL channel order/data type where reflection
                // asked for them (used by clspv-generated readers/writers).
                cl_image_format imageFormat = vkMem.getFormat();
                const VkPushConstantRange *imageDataChannelOrderRange =
                    devProgramData->getImageDataChannelOrderRange(index);
                if (imageDataChannelOrderRange != nullptr)
                {
                    mComputePassCommands->getCommandBuffer().pushConstants(
                        kernelVk.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT,
                        imageDataChannelOrderRange->offset, imageDataChannelOrderRange->size,
                        &imageFormat.image_channel_order);
                }

                const VkPushConstantRange *imageDataChannelDataTypeRange =
                    devProgramData->getImageDataChannelDataTypeRange(index);
                if (imageDataChannelDataTypeRange != nullptr)
                {
                    mComputePassCommands->getCommandBuffer().pushConstants(
                        kernelVk.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT,
                        imageDataChannelDataTypeRange->offset, imageDataChannelDataTypeRange->size,
                        &imageFormat.image_channel_data_type);
                }

                // Update image/descriptor info
                VkDescriptorImageInfo &imageInfo =
                    kernelArgDescSetBuilder.allocDescriptorImageInfo();
                imageInfo.imageLayout = arg.type == NonSemanticClspvReflectionArgumentStorageImage
                                            ? VK_IMAGE_LAYOUT_GENERAL
                                            : vkMem.getImage().getCurrentLayout();
                imageInfo.imageView = vkMem.getImageView().getHandle();
                imageInfo.sampler   = VK_NULL_HANDLE;
                VkWriteDescriptorSet &writeDescriptorSet =
                    kernelArgDescSetBuilder.allocWriteDescriptorSet();
                writeDescriptorSet.descriptorCount = 1;
                writeDescriptorSet.descriptorType =
                    arg.type == NonSemanticClspvReflectionArgumentStorageImage
                        ? VK_DESCRIPTOR_TYPE_STORAGE_IMAGE
                        : VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE;
                writeDescriptorSet.pImageInfo = &imageInfo;
                writeDescriptorSet.sType      = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
                writeDescriptorSet.dstSet =
                    kernelVk.getDescriptorSet(DescriptorSetIndex::KernelArguments);
                writeDescriptorSet.dstBinding = arg.descriptorBinding;
                break;
            }
            case NonSemanticClspvReflectionArgumentUniformTexelBuffer:
            case NonSemanticClspvReflectionArgumentStorageTexelBuffer:
            {
                cl::Memory *clMem = cl::Image::Cast(static_cast<const cl_mem>(arg.handle));
                CLImageVk &vkMem  = clMem->getImpl<CLImageVk>();

                ANGLE_TRY(addMemoryDependencies(clMem));

                // Texel buffers are bound through a VkBufferView rather than
                // an image view or plain buffer descriptor.
                VkBufferView &bufferView           = kernelArgDescSetBuilder.allocBufferView();
                const vk::BufferView *vkBufferView = nullptr;
                ANGLE_TRY(vkMem.getBufferView(&vkBufferView));
                bufferView = vkBufferView->getHandle();

                VkWriteDescriptorSet &writeDescriptorSet =
                    kernelArgDescSetBuilder.allocWriteDescriptorSet();
                writeDescriptorSet.descriptorCount = 1;
                writeDescriptorSet.descriptorType =
                    arg.type == NonSemanticClspvReflectionArgumentStorageTexelBuffer
                        ? VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER
                        : VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER;
                writeDescriptorSet.pImageInfo = nullptr;
                writeDescriptorSet.sType      = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
                writeDescriptorSet.dstSet =
                    kernelVk.getDescriptorSet(DescriptorSetIndex::KernelArguments);
                writeDescriptorSet.dstBinding       = arg.descriptorBinding;
                writeDescriptorSet.pTexelBufferView = &bufferView;

                break;
            }
            case NonSemanticClspvReflectionArgumentPodUniform:
            case NonSemanticClspvReflectionArgumentPodStorageBuffer:
            {
                // All POD args share one buffer; remember its binding/type the
                // first time and emit a single descriptor write after the loop.
                if (!podBufferPresent)
                {
                    podBufferPresent  = true;
                    podBinding        = arg.descriptorBinding;
                    podDescriptorType = arg.type == NonSemanticClspvReflectionArgumentPodUniform
                                            ? VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER
                                            : VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
                }
                break;
            }
            case NonSemanticClspvReflectionArgumentPointerUniform:
            case NonSemanticClspvReflectionArgumentPointerPushConstant:
            default:
            {
                UNIMPLEMENTED();
                break;
            }
        }
    }
    // Emit the single descriptor write for the aggregated POD buffer, if any.
    if (podBufferPresent)
    {
        cl::MemoryPtr clMem = kernelVk.getPodBuffer();
        ASSERT(clMem != nullptr);
        CLBufferVk &vkMem = clMem->getImpl<CLBufferVk>();

        VkDescriptorBufferInfo &bufferInfo = kernelArgDescSetBuilder.allocDescriptorBufferInfo();
        bufferInfo.range                   = clMem->getSize();
        bufferInfo.offset                  = clMem->getOffset();
        bufferInfo.buffer                  = vkMem.getBuffer().getBuffer().getHandle();

        ANGLE_TRY(addMemoryDependencies(clMem.get()));

        VkWriteDescriptorSet &writeDescriptorSet =
            kernelArgDescSetBuilder.allocWriteDescriptorSet();
        writeDescriptorSet.sType  = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
        writeDescriptorSet.pNext  = nullptr;
        writeDescriptorSet.dstSet = kernelVk.getDescriptorSet(DescriptorSetIndex::KernelArguments);
        writeDescriptorSet.dstBinding      = podBinding;
        writeDescriptorSet.dstArrayElement = 0;
        writeDescriptorSet.descriptorCount = 1;
        writeDescriptorSet.descriptorType  = podDescriptorType;
        writeDescriptorSet.pImageInfo      = nullptr;
        writeDescriptorSet.pBufferInfo     = &bufferInfo;
    }

    // process the printf storage buffer
    if (kernelVk.usesPrintf())
    {
        UpdateDescriptorSetsBuilder &printfDescSetBuilder =
            updateDescriptorSetsBuilders[DescriptorSetIndex::Printf];

        cl::MemoryPtr clMem = getOrCreatePrintfBuffer();
        CLBufferVk &vkMem   = clMem->getImpl<CLBufferVk>();
        uint8_t *mapPointer = nullptr;
        ANGLE_TRY(vkMem.map(mapPointer, 0));
        // The spec calls out *The first 4 bytes of the buffer should be zero-initialized.*
        memset(mapPointer, 0, 4);

        auto &bufferInfo  = printfDescSetBuilder.allocDescriptorBufferInfo();
        bufferInfo.range  = clMem->getSize();
        bufferInfo.offset = clMem->getOffset();
        bufferInfo.buffer = vkMem.getBuffer().getBuffer().getHandle();

        auto &writeDescriptorSet           = printfDescSetBuilder.allocWriteDescriptorSet();
        writeDescriptorSet.descriptorCount = 1;
        writeDescriptorSet.descriptorType  = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
        writeDescriptorSet.pBufferInfo     = &bufferInfo;
        writeDescriptorSet.sType           = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
        writeDescriptorSet.dstSet          = kernelVk.getDescriptorSet(DescriptorSetIndex::Printf);
        // NOTE(review): this re-fetches device program data by kernel name
        // while the rest of the function uses |devProgramData| (fetched by
        // device) -- confirm both lookups return the same data.
        writeDescriptorSet.dstBinding      = kernelVk.getProgram()
                                            ->getDeviceProgramData(kernelVk.getKernelName().c_str())
                                            ->reflectionData.printfBufferStorage.binding;

        // Flag that the printf buffer must be drained/parsed at finish time.
        mNeedPrintfHandling = true;
        mPrintfInfos        = kernelVk.getProgram()->getPrintfDescriptors(kernelVk.getKernelName());
    }

    // Flush all accumulated descriptor writes and bind each non-empty set at
    // its compacted set index (matching the allocation order above).
    angle::EnumIterator<DescriptorSetIndex> descriptorSetIndex(DescriptorSetIndex::LiteralSampler);
    for (DescriptorSetIndex index : angle::AllEnums<DescriptorSetIndex>())
    {
        if (!kernelVk.getDescriptorSetLayoutDesc(index).empty())
        {
            mContext->getPerfCounters().writeDescriptorSets =
                updateDescriptorSetsBuilders[index].flushDescriptorSetUpdates(
                    mContext->getRenderer()->getDevice());

            VkDescriptorSet descriptorSet = kernelVk.getDescriptorSet(index);
            mComputePassCommands->getCommandBuffer().bindDescriptorSets(
                kernelVk.getPipelineLayout(), VK_PIPELINE_BIND_POINT_COMPUTE, *descriptorSetIndex,
                1, &descriptorSet, 0, nullptr);

            ++descriptorSetIndex;
        }
    }

    return angle::Result::Continue;
}
1886 
processGlobalPushConstants(CLKernelVk & kernelVk,const cl::NDRange & ndrange)1887 angle::Result CLCommandQueueVk::processGlobalPushConstants(CLKernelVk &kernelVk,
1888                                                            const cl::NDRange &ndrange)
1889 {
1890     const CLProgramVk::DeviceProgramData *devProgramData =
1891         kernelVk.getProgram()->getDeviceProgramData(mCommandQueue.getDevice().getNative());
1892     ASSERT(devProgramData != nullptr);
1893 
1894     const VkPushConstantRange *globalOffsetRange = devProgramData->getGlobalOffsetRange();
1895     if (globalOffsetRange != nullptr)
1896     {
1897         mComputePassCommands->getCommandBuffer().pushConstants(
1898             kernelVk.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT, globalOffsetRange->offset,
1899             globalOffsetRange->size, ndrange.globalWorkOffset.data());
1900     }
1901 
1902     const VkPushConstantRange *globalSizeRange = devProgramData->getGlobalSizeRange();
1903     if (globalSizeRange != nullptr)
1904     {
1905         mComputePassCommands->getCommandBuffer().pushConstants(
1906             kernelVk.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT, globalSizeRange->offset,
1907             globalSizeRange->size, ndrange.globalWorkSize.data());
1908     }
1909 
1910     const VkPushConstantRange *enqueuedLocalSizeRange = devProgramData->getEnqueuedLocalSizeRange();
1911     if (enqueuedLocalSizeRange != nullptr)
1912     {
1913         mComputePassCommands->getCommandBuffer().pushConstants(
1914             kernelVk.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT,
1915             enqueuedLocalSizeRange->offset, enqueuedLocalSizeRange->size,
1916             ndrange.localWorkSize.data());
1917     }
1918 
1919     const VkPushConstantRange *numWorkgroupsRange = devProgramData->getNumWorkgroupsRange();
1920     if (devProgramData->reflectionData.pushConstants.contains(
1921             NonSemanticClspvReflectionPushConstantNumWorkgroups))
1922     {
1923         // We support non-uniform workgroups, thus take the ceil of the quotient
1924         uint32_t numWorkgroups[3] = {
1925             UnsignedCeilDivide(ndrange.globalWorkSize[0], ndrange.localWorkSize[0]),
1926             UnsignedCeilDivide(ndrange.globalWorkSize[1], ndrange.localWorkSize[1]),
1927             UnsignedCeilDivide(ndrange.globalWorkSize[2], ndrange.localWorkSize[2])};
1928         mComputePassCommands->getCommandBuffer().pushConstants(
1929             kernelVk.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT, numWorkgroupsRange->offset,
1930             numWorkgroupsRange->size, &numWorkgroups);
1931     }
1932 
1933     return angle::Result::Continue;
1934 }
1935 
flushComputePassCommands()1936 angle::Result CLCommandQueueVk::flushComputePassCommands()
1937 {
1938     if (mComputePassCommands->empty())
1939     {
1940         return angle::Result::Continue;
1941     }
1942 
1943     // Flush any host visible buffers by adding appropriate barriers
1944     if (mComputePassCommands->getAndResetHasHostVisibleBufferWrite())
1945     {
1946         // Make sure all writes to host-visible buffers are flushed.
1947         VkMemoryBarrier memoryBarrier = {};
1948         memoryBarrier.sType           = VK_STRUCTURE_TYPE_MEMORY_BARRIER;
1949         memoryBarrier.srcAccessMask   = VK_ACCESS_MEMORY_WRITE_BIT;
1950         memoryBarrier.dstAccessMask   = VK_ACCESS_HOST_READ_BIT | VK_ACCESS_HOST_WRITE_BIT;
1951 
1952         mComputePassCommands->getCommandBuffer().memoryBarrier(
1953             VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
1954             VK_PIPELINE_STAGE_HOST_BIT, memoryBarrier);
1955     }
1956 
1957     // get hold of the queue serial that is flushed, post the flush the command buffer will be reset
1958     mLastFlushedQueueSerial = mComputePassCommands->getQueueSerial();
1959     // Here, we flush our compute cmds to RendererVk's primary command buffer
1960     ANGLE_TRY(mContext->getRenderer()->flushOutsideRPCommands(
1961         mContext, getProtectionType(), egl::ContextPriority::Medium, &mComputePassCommands));
1962 
1963     mContext->getPerfCounters().flushedOutsideRenderPassCommandBuffers++;
1964 
1965     // Generate new serial for next batch of cmds
1966     mComputePassCommands->setQueueSerial(
1967         mQueueSerialIndex, mContext->getRenderer()->generateQueueSerial(mQueueSerialIndex));
1968 
1969     return angle::Result::Continue;
1970 }
1971 
processWaitlist(const cl::EventPtrs & waitEvents)1972 angle::Result CLCommandQueueVk::processWaitlist(const cl::EventPtrs &waitEvents)
1973 {
1974     if (!waitEvents.empty())
1975     {
1976         bool insertedBarrier = false;
1977         for (const cl::EventPtr &event : waitEvents)
1978         {
1979             if (event->getImpl<CLEventVk>().isUserEvent() ||
1980                 event->getCommandQueue() != &mCommandQueue)
1981             {
1982                 // We cannot use a barrier in these cases, therefore defer the event
1983                 // handling till submission time
1984                 // TODO: Perhaps we could utilize VkEvents here instead and have GPU wait(s)
1985                 // https://anglebug.com/42267109
1986                 mExternalEvents.push_back(event);
1987             }
1988             else if (event->getCommandQueue() == &mCommandQueue && !insertedBarrier)
1989             {
1990                 // As long as there is at least one dependant command in same queue,
1991                 // we just need to insert one execution barrier
1992                 ANGLE_TRY(insertBarrier());
1993 
1994                 insertedBarrier = true;
1995             }
1996         }
1997     }
1998     return angle::Result::Continue;
1999 }
2000 
submitCommands()2001 angle::Result CLCommandQueueVk::submitCommands()
2002 {
2003     ANGLE_TRACE_EVENT0("gpu.angle", "CLCommandQueueVk::submitCommands()");
2004 
2005     ASSERT(hasCommandsPendingSubmission());
2006 
2007     // Kick off renderer submit
2008     ANGLE_TRY(mContext->getRenderer()->submitCommands(mContext, getProtectionType(),
2009                                                       egl::ContextPriority::Medium, nullptr,
2010                                                       nullptr, {}, mLastFlushedQueueSerial));
2011 
2012     mLastSubmittedQueueSerial = mLastFlushedQueueSerial;
2013 
2014     // Now that we have submitted commands, some of pending garbage may no longer pending
2015     // and should be moved to garbage list.
2016     mContext->getRenderer()->cleanupPendingSubmissionGarbage();
2017 
2018     return angle::Result::Continue;
2019 }
2020 
createEvent(CLEventImpl::CreateFunc * createFunc,cl::ExecutionStatus initialStatus)2021 angle::Result CLCommandQueueVk::createEvent(CLEventImpl::CreateFunc *createFunc,
2022                                             cl::ExecutionStatus initialStatus)
2023 {
2024     if (createFunc != nullptr)
2025     {
2026         *createFunc = [initialStatus, queueSerial = mComputePassCommands->getQueueSerial()](
2027                           const cl::Event &event) {
2028             auto eventVk = new (std::nothrow) CLEventVk(event, initialStatus, queueSerial);
2029             if (eventVk == nullptr)
2030             {
2031                 ERR() << "Failed to create cmd event obj!";
2032                 return CLEventImpl::Ptr(nullptr);
2033             }
2034             return CLEventImpl::Ptr(eventVk);
2035         };
2036     }
2037     return angle::Result::Continue;
2038 }
2039 
submitEmptyCommand()2040 angle::Result CLCommandQueueVk::submitEmptyCommand()
2041 {
2042     // This will be called as part of resetting the command buffer and command buffer has to be
2043     // empty.
2044     ASSERT(mComputePassCommands->empty());
2045 
2046     // There is nothing to be flushed, mark it flushed and do a submit to signal the queue serial
2047     mLastFlushedQueueSerial = mComputePassCommands->getQueueSerial();
2048     ANGLE_TRY(submitCommands());
2049     ANGLE_TRY(finishQueueSerialInternal(mLastSubmittedQueueSerial));
2050 
2051     // increment the queue serial for the next command batch
2052     mComputePassCommands->setQueueSerial(
2053         mQueueSerialIndex, mContext->getRenderer()->generateQueueSerial(mQueueSerialIndex));
2054 
2055     return angle::Result::Continue;
2056 }
2057 
resetCommandBufferWithError(cl_int errorCode)2058 angle::Result CLCommandQueueVk::resetCommandBufferWithError(cl_int errorCode)
2059 {
2060     // Got an error so reset the command buffer and report back error to all the associated
2061     // events
2062     ASSERT(errorCode != CL_SUCCESS);
2063 
2064     QueueSerial currentSerial = mComputePassCommands->getQueueSerial();
2065     mComputePassCommands->getCommandBuffer().reset();
2066 
2067     for (cl::EventPtr event : mCommandsStateMap[currentSerial].events)
2068     {
2069         CLEventVk *eventVk = &event->getImpl<CLEventVk>();
2070         if (!eventVk->isUserEvent())
2071         {
2072             ANGLE_TRY(
2073                 eventVk->setStatusAndExecuteCallback(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST));
2074         }
2075     }
2076     mCommandsStateMap.erase(currentSerial);
2077     mExternalEvents.clear();
2078 
2079     // Command buffer has been reset and as such the associated queue serial will not get signaled
2080     // leading to causality issues. So submit an empty command to keep the queue serials timelines
2081     // intact.
2082     ANGLE_TRY(submitEmptyCommand());
2083 
2084     ANGLE_CL_RETURN_ERROR(errorCode);
2085 }
2086 
// Shared completion path for a queue serial that has already been submitted: waits for
// the GPU work to finish, syncs host-visible buffers back, drains printf output, and
// marks the serial's associated events complete. Callers hold the queue lock.
angle::Result CLCommandQueueVk::finishQueueSerialInternal(const QueueSerial queueSerial)
{
    // Queue serial must belong to this queue and work must have been submitted.
    ASSERT(queueSerial.getIndex() == mQueueSerialIndex);
    ASSERT(mContext->getRenderer()->hasQueueSerialSubmitted(queueSerial));

    ANGLE_TRY(mContext->getRenderer()->finishQueueSerial(mContext, queueSerial));

    // Ensure memory objects are synced back to host CPU
    ANGLE_TRY(syncHostBuffers(mCommandsStateMap[queueSerial].hostTransferList));

    // Decode any printf output that finished kernels wrote into the printf buffer.
    if (mNeedPrintfHandling)
    {
        ANGLE_TRY(processPrintfBuffer());
        mNeedPrintfHandling = false;
    }

    // Events associated with this queue serial and ready to be marked complete
    ANGLE_TRY(SetEventsWithQueueSerialToState(mCommandsStateMap[queueSerial].events, queueSerial,
                                              cl::ExecutionStatus::Complete));

    // This batch is fully retired; drop its bookkeeping.
    mExternalEvents.clear();
    mCommandsStateMap.erase(queueSerial);

    return angle::Result::Continue;
}
2113 
finishQueueSerial(const QueueSerial queueSerial)2114 angle::Result CLCommandQueueVk::finishQueueSerial(const QueueSerial queueSerial)
2115 {
2116     ASSERT(queueSerial.getIndex() == getQueueSerialIndex());
2117     ASSERT(mContext->getRenderer()->hasQueueSerialSubmitted(queueSerial));
2118 
2119     ANGLE_TRY(mContext->getRenderer()->finishQueueSerial(mContext, queueSerial));
2120 
2121     std::lock_guard<std::mutex> sl(mCommandQueueMutex);
2122 
2123     return finishQueueSerialInternal(queueSerial);
2124 }
2125 
// Flushes and submits any recorded compute commands. External dependencies (user
// events and events belonging to other queues) are resolved on the CPU first, since
// they could not be expressed as barriers in this queue's command buffer.
angle::Result CLCommandQueueVk::flushInternal()
{
    if (!mComputePassCommands->empty())
    {
        // If we still have dependant events, handle them now
        if (!mExternalEvents.empty())
        {
            for (const auto &depEvent : mExternalEvents)
            {
                if (depEvent->getImpl<CLEventVk>().isUserEvent())
                {
                    // We just wait here for user to set the event object
                    cl_int status = CL_QUEUED;
                    ANGLE_TRY(depEvent->getImpl<CLEventVk>().waitForUserEventStatus());
                    ANGLE_TRY(depEvent->getImpl<CLEventVk>().getCommandExecutionStatus(status));
                    // A negative execution status means the user event was terminated
                    // with an error; abandon this batch with the matching CL error.
                    if (status < 0)
                    {
                        ERR() << "Invalid dependant user-event (" << depEvent.get()
                              << ") status encountered!";
                        ANGLE_TRY(resetCommandBufferWithError(
                            CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST));
                    }
                }
                else
                {
                    // Otherwise, we just need to submit/finish for dependant event queues
                    // here that are not associated with this queue
                    ANGLE_TRY(depEvent->getCommandQueue()->finish());
                }
            }
            mExternalEvents.clear();
        }

        ANGLE_TRY(flushComputePassCommands());
        // NOTE(review): this copies the whole CommandsState (including its event list);
        // confirm whether a const reference would be safe across submitCommands() below.
        CommandsState commandsState = mCommandsStateMap[mLastFlushedQueueSerial];
        // Flushed but not yet submitted: events transition to Submitted...
        ANGLE_TRY(SetEventsWithQueueSerialToState(commandsState.events, mLastFlushedQueueSerial,
                                                  cl::ExecutionStatus::Submitted));

        ANGLE_TRY(submitCommands());
        ASSERT(!hasCommandsPendingSubmission());
        // ...and to Running once the submission has gone through.
        ANGLE_TRY(SetEventsWithQueueSerialToState(commandsState.events, mLastSubmittedQueueSerial,
                                                  cl::ExecutionStatus::Running));
    }

    return angle::Result::Continue;
}
2172 
finishInternal()2173 angle::Result CLCommandQueueVk::finishInternal()
2174 {
2175     ANGLE_TRACE_EVENT0("gpu.angle", "CLCommandQueueVk::finish");
2176     ANGLE_TRY(flushInternal());
2177 
2178     return finishQueueSerialInternal(mLastSubmittedQueueSerial);
2179 }
2180 
2181 // Helper function to insert appropriate memory barriers before accessing the resources in the
2182 // command buffer.
onResourceAccess(const vk::CommandBufferAccess & access)2183 angle::Result CLCommandQueueVk::onResourceAccess(const vk::CommandBufferAccess &access)
2184 {
2185     // Buffers
2186     for (const vk::CommandBufferBufferAccess &bufferAccess : access.getReadBuffers())
2187     {
2188         if (mComputePassCommands->usesBufferForWrite(*bufferAccess.buffer))
2189         {
2190             // read buffers only need a new command buffer if previously used for write
2191             ANGLE_TRY(flushInternal());
2192         }
2193 
2194         mComputePassCommands->bufferRead(mContext, bufferAccess.accessType, bufferAccess.stage,
2195                                          bufferAccess.buffer);
2196     }
2197 
2198     for (const vk::CommandBufferBufferAccess &bufferAccess : access.getWriteBuffers())
2199     {
2200         if (mComputePassCommands->usesBuffer(*bufferAccess.buffer))
2201         {
2202             // write buffers always need a new command buffer
2203             ANGLE_TRY(flushInternal());
2204         }
2205 
2206         mComputePassCommands->bufferWrite(mContext, bufferAccess.accessType, bufferAccess.stage,
2207                                           bufferAccess.buffer);
2208         if (bufferAccess.buffer->isHostVisible())
2209         {
2210             // currently all are host visible so nothing to do
2211         }
2212     }
2213 
2214     for (const vk::CommandBufferBufferExternalAcquireRelease &bufferAcquireRelease :
2215          access.getExternalAcquireReleaseBuffers())
2216     {
2217         mComputePassCommands->retainResourceForWrite(bufferAcquireRelease.buffer);
2218     }
2219 
2220     for (const vk::CommandBufferResourceAccess &resourceAccess : access.getAccessResources())
2221     {
2222         mComputePassCommands->retainResource(resourceAccess.resource);
2223     }
2224 
2225     return angle::Result::Continue;
2226 }
2227 
processPrintfBuffer()2228 angle::Result CLCommandQueueVk::processPrintfBuffer()
2229 {
2230     ASSERT(mPrintfBuffer);
2231     ASSERT(mNeedPrintfHandling);
2232     ASSERT(mPrintfInfos);
2233 
2234     cl::MemoryPtr clMem = getOrCreatePrintfBuffer();
2235     CLBufferVk &vkMem   = clMem->getImpl<CLBufferVk>();
2236 
2237     unsigned char *data = nullptr;
2238     ANGLE_TRY(vkMem.map(data, 0));
2239     ANGLE_TRY(ClspvProcessPrintfBuffer(data, vkMem.getSize(), mPrintfInfos));
2240     vkMem.unmap();
2241 
2242     return angle::Result::Continue;
2243 }
2244 
2245 // A single CL buffer is setup for every command queue of size kPrintfBufferSize. This can be
2246 // expanded later, if more storage is needed.
getOrCreatePrintfBuffer()2247 cl::MemoryPtr CLCommandQueueVk::getOrCreatePrintfBuffer()
2248 {
2249     if (!mPrintfBuffer)
2250     {
2251         mPrintfBuffer = cl::Buffer::Cast(mContext->getFrontendObject().createBuffer(
2252             nullptr, cl::MemFlags(CL_MEM_READ_WRITE), kPrintfBufferSize, nullptr));
2253     }
2254     return cl::MemoryPtr(mPrintfBuffer);
2255 }
2256 
hasUserEventDependency() const2257 bool CLCommandQueueVk::hasUserEventDependency() const
2258 {
2259     return std::any_of(mExternalEvents.begin(), mExternalEvents.end(),
2260                        [](const cl::EventPtr event) { return event->isUserEvent(); });
2261 }
2262 
addEventReference(CLEventVk & eventVk)2263 void CLCommandQueueVk::addEventReference(CLEventVk &eventVk)
2264 {
2265     ASSERT(eventVk.getQueueSerial().valid());
2266     ASSERT(eventVk.getQueueSerial().getIndex() == mQueueSerialIndex);
2267 
2268     std::lock_guard<std::mutex> lock(mCommandQueueMutex);
2269 
2270     mCommandsStateMap[eventVk.getQueueSerial()].events.emplace_back(&eventVk.getFrontendObject());
2271 }
2272 
2273 }  // namespace rx
2274