//
// Copyright 2021 The ANGLE Project Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// CLCommandQueueVk.cpp: Implements the class methods for CLCommandQueueVk.

#include "libANGLE/renderer/vulkan/CLCommandQueueVk.h"
#include "libANGLE/renderer/vulkan/CLContextVk.h"
#include "libANGLE/renderer/vulkan/CLDeviceVk.h"
#include "libANGLE/renderer/vulkan/CLKernelVk.h"
#include "libANGLE/renderer/vulkan/CLMemoryVk.h"
#include "libANGLE/renderer/vulkan/CLProgramVk.h"
#include "libANGLE/renderer/vulkan/cl_types.h"
#include "libANGLE/renderer/vulkan/vk_renderer.h"

#include "libANGLE/CLBuffer.h"
#include "libANGLE/CLCommandQueue.h"
#include "libANGLE/CLContext.h"
#include "libANGLE/CLEvent.h"
#include "libANGLE/CLKernel.h"
#include "libANGLE/cl_utils.h"

#include "spirv/unified1/NonSemanticClspvReflection.h"

namespace rx
{

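// Worker-pool task used by CLCommandQueueVk::flush(): it performs a blocking finish() on a
// worker thread so that clFlush itself can return without blocking.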
class CLAsyncFinishTask : public angle::Closure
{
  public:
    CLAsyncFinishTask(CLCommandQueueVk *queueVk) : mQueueVk(queueVk) {}

    void operator()() override
    {
        ANGLE_TRACE_EVENT0("gpu.angle", "CLCommandQueueVk::finish (async)");
        if (IsError(mQueueVk->finish()))
        {
            ERR() << "Async finish (clFlush) failed for queue (" << mQueueVk << ")!";
        }
    }

  private:
    CLCommandQueueVk *mQueueVk;
};

CLCommandQueueVk::CLCommandQueueVk(const cl::CommandQueue &commandQueue)
    : CLCommandQueueImpl(commandQueue),
      mContext(&commandQueue.getContext().getImpl<CLContextVk>()),
      mDevice(&commandQueue.getDevice().getImpl<CLDeviceVk>()),
      mComputePassCommands(nullptr),
      mCurrentQueueSerialIndex(kInvalidQueueSerialIndex),
      mHasAnyCommandsPendingSubmission(false)
{}

angle::Result CLCommandQueueVk::init()
{
    ANGLE_CL_IMPL_TRY_ERROR(
        vk::OutsideRenderPassCommandBuffer::InitializeCommandPool(
            mContext, &mCommandPool.outsideRenderPassPool,
            mContext->getRenderer()->getQueueFamilyIndex(), getProtectionType()),
        CL_OUT_OF_RESOURCES);

    ANGLE_CL_IMPL_TRY_ERROR(mContext->getRenderer()->getOutsideRenderPassCommandBufferHelper(
                                mContext, &mCommandPool.outsideRenderPassPool,
                                &mOutsideRenderPassCommandsAllocator, &mComputePassCommands),
                            CL_OUT_OF_RESOURCES);

    // Generate initial QueueSerial for command buffer helper
    ANGLE_CL_IMPL_TRY_ERROR(
        mContext->getRenderer()->allocateQueueSerialIndex(&mCurrentQueueSerialIndex),
        CL_OUT_OF_RESOURCES);
    mComputePassCommands->setQueueSerial(
        mCurrentQueueSerialIndex,
        mContext->getRenderer()->generateQueueSerial(mCurrentQueueSerialIndex));

    // Initialize serials to be valid but appear submitted and finished.
    mLastFlushedQueueSerial   = QueueSerial(mCurrentQueueSerialIndex, Serial());
    mLastSubmittedQueueSerial = mLastFlushedQueueSerial;

    return angle::Result::Continue;
}

CLCommandQueueVk::~CLCommandQueueVk()
{
    VkDevice vkDevice = mContext->getDevice();

    if (mCurrentQueueSerialIndex != kInvalidQueueSerialIndex)
    {
        mContext->getRenderer()->releaseQueueSerialIndex(mCurrentQueueSerialIndex);
        mCurrentQueueSerialIndex = kInvalidQueueSerialIndex;
    }

    // Recycle the current command buffers
    mContext->getRenderer()->recycleOutsideRenderPassCommandBufferHelper(&mComputePassCommands);
    mCommandPool.outsideRenderPassPool.destroy(vkDevice);
}

angle::Result CLCommandQueueVk::setProperty(cl::CommandQueueProperties properties, cl_bool enable)
{
    // NOTE: "clSetCommandQueueProperty" has been deprecated as of OpenCL 1.1
    // http://man.opencl.org/deprecated.html
    return angle::Result::Continue;
}

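// For a blocking read, finish the queue and copy straight into the user pointer. For a
// non-blocking read, record a GPU copy into a CL_MEM_USE_HOST_PTR transfer buffer that wraps
// the user pointer; syncHostBuffers() writes the data back once the batch finishes.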
angle::Result CLCommandQueueVk::enqueueReadBuffer(const cl::Buffer &buffer,
                                                  bool blocking,
                                                  size_t offset,
                                                  size_t size,
                                                  void *ptr,
                                                  const cl::EventPtrs &waitEvents,
                                                  CLEventImpl::CreateFunc *eventCreateFunc)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);

    ANGLE_TRY(processWaitlist(waitEvents));

    if (blocking)
    {
        ANGLE_TRY(finishInternal());
        auto bufferVk = &buffer.getImpl<CLBufferVk>();
        ANGLE_TRY(bufferVk->copyTo(ptr, offset, size));
    }
    else
    {
        CLBufferVk &bufferVk = buffer.getImpl<CLBufferVk>();

        // Transfer buffer creation limit heuristic reached; finish the current batch
        if (mHostBufferUpdateList.size() >= kMaxHostBufferUpdateListSize)
        {
            ANGLE_TRY(finishInternal());
        }

        // Create a transfer buffer and add it to the update list
        mHostBufferUpdateList.emplace_back(
            cl::Buffer::Cast(this->mContext->getFrontendObject().createBuffer(
                nullptr, cl::MemFlags{buffer.getFlags().get() | CL_MEM_USE_HOST_PTR},
                buffer.getSize(), ptr)));
        if (mHostBufferUpdateList.back() == nullptr)
        {
            ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES);
        }
        CLBufferVk &transferBufferVk = mHostBufferUpdateList.back()->getImpl<CLBufferVk>();
        // Release the initialization reference; lifetime is controlled by the RefPointer.
        mHostBufferUpdateList.back()->release();

        const VkBufferCopy copyRegion = {offset, offset, size};

        // We need an execution barrier if the buffer can be written to by a kernel
        if (!mComputePassCommands->getCommandBuffer().empty() && bufferVk.isWritable())
        {
            VkMemoryBarrier memoryBarrier = {
                VK_STRUCTURE_TYPE_MEMORY_BARRIER, nullptr, VK_ACCESS_SHADER_WRITE_BIT,
                VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT};
            mComputePassCommands->getCommandBuffer().pipelineBarrier(
                VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1,
                &memoryBarrier, 0, nullptr, 0, nullptr);
        }

        mComputePassCommands->getCommandBuffer().copyBuffer(
            bufferVk.getBuffer().getBuffer(), transferBufferVk.getBuffer().getBuffer(), 1,
            &copyRegion);
    }

    ANGLE_TRY(createEvent(eventCreateFunc));

    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::enqueueWriteBuffer(const cl::Buffer &buffer,
                                                   bool blocking,
                                                   size_t offset,
                                                   size_t size,
                                                   const void *ptr,
                                                   const cl::EventPtrs &waitEvents,
                                                   CLEventImpl::CreateFunc *eventCreateFunc)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);

    ANGLE_TRY(processWaitlist(waitEvents));

    auto bufferVk = &buffer.getImpl<CLBufferVk>();
    ANGLE_TRY(bufferVk->copyFrom(ptr, offset, size));
    if (blocking)
    {
        ANGLE_TRY(finishInternal());
    }

    ANGLE_TRY(createEvent(eventCreateFunc));

    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::enqueueReadBufferRect(const cl::Buffer &buffer,
                                                      bool blocking,
                                                      const cl::MemOffsets &bufferOrigin,
                                                      const cl::MemOffsets &hostOrigin,
                                                      const cl::Coordinate &region,
                                                      size_t bufferRowPitch,
                                                      size_t bufferSlicePitch,
                                                      size_t hostRowPitch,
                                                      size_t hostSlicePitch,
                                                      void *ptr,
                                                      const cl::EventPtrs &waitEvents,
                                                      CLEventImpl::CreateFunc *eventCreateFunc)
{
    UNIMPLEMENTED();
    ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES);
}

angle::Result CLCommandQueueVk::enqueueWriteBufferRect(const cl::Buffer &buffer,
                                                       bool blocking,
                                                       const cl::MemOffsets &bufferOrigin,
                                                       const cl::MemOffsets &hostOrigin,
                                                       const cl::Coordinate &region,
                                                       size_t bufferRowPitch,
                                                       size_t bufferSlicePitch,
                                                       size_t hostRowPitch,
                                                       size_t hostSlicePitch,
                                                       const void *ptr,
                                                       const cl::EventPtrs &waitEvents,
                                                       CLEventImpl::CreateFunc *eventCreateFunc)
{
    UNIMPLEMENTED();
    ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES);
}

angle::Result CLCommandQueueVk::enqueueCopyBuffer(const cl::Buffer &srcBuffer,
                                                  const cl::Buffer &dstBuffer,
                                                  size_t srcOffset,
                                                  size_t dstOffset,
                                                  size_t size,
                                                  const cl::EventPtrs &waitEvents,
                                                  CLEventImpl::CreateFunc *eventCreateFunc)
{
    UNIMPLEMENTED();
    ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES);
}

angle::Result CLCommandQueueVk::enqueueCopyBufferRect(const cl::Buffer &srcBuffer,
                                                      const cl::Buffer &dstBuffer,
                                                      const cl::MemOffsets &srcOrigin,
                                                      const cl::MemOffsets &dstOrigin,
                                                      const cl::Coordinate &region,
                                                      size_t srcRowPitch,
                                                      size_t srcSlicePitch,
                                                      size_t dstRowPitch,
                                                      size_t dstSlicePitch,
                                                      const cl::EventPtrs &waitEvents,
                                                      CLEventImpl::CreateFunc *eventCreateFunc)
{
    UNIMPLEMENTED();
    ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES);
}

angle::Result CLCommandQueueVk::enqueueFillBuffer(const cl::Buffer &buffer,
                                                  const void *pattern,
                                                  size_t patternSize,
                                                  size_t offset,
                                                  size_t size,
                                                  const cl::EventPtrs &waitEvents,
                                                  CLEventImpl::CreateFunc *eventCreateFunc)
{
    UNIMPLEMENTED();
    ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES);
}

angle::Result CLCommandQueueVk::enqueueMapBuffer(const cl::Buffer &buffer,
                                                 bool blocking,
                                                 cl::MapFlags mapFlags,
                                                 size_t offset,
                                                 size_t size,
                                                 const cl::EventPtrs &waitEvents,
                                                 CLEventImpl::CreateFunc *eventCreateFunc,
                                                 void *&mapPtr)
{
    UNIMPLEMENTED();
    ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES);
}

angle::Result CLCommandQueueVk::enqueueReadImage(const cl::Image &image,
                                                 bool blocking,
                                                 const cl::MemOffsets &origin,
                                                 const cl::Coordinate &region,
                                                 size_t rowPitch,
                                                 size_t slicePitch,
                                                 void *ptr,
                                                 const cl::EventPtrs &waitEvents,
                                                 CLEventImpl::CreateFunc *eventCreateFunc)
{
    UNIMPLEMENTED();
    ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES);
}

angle::Result CLCommandQueueVk::enqueueWriteImage(const cl::Image &image,
                                                  bool blocking,
                                                  const cl::MemOffsets &origin,
                                                  const cl::Coordinate &region,
                                                  size_t inputRowPitch,
                                                  size_t inputSlicePitch,
                                                  const void *ptr,
                                                  const cl::EventPtrs &waitEvents,
                                                  CLEventImpl::CreateFunc *eventCreateFunc)
{
    UNIMPLEMENTED();
    ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES);
}

angle::Result CLCommandQueueVk::enqueueCopyImage(const cl::Image &srcImage,
                                                 const cl::Image &dstImage,
                                                 const cl::MemOffsets &srcOrigin,
                                                 const cl::MemOffsets &dstOrigin,
                                                 const cl::Coordinate &region,
                                                 const cl::EventPtrs &waitEvents,
                                                 CLEventImpl::CreateFunc *eventCreateFunc)
{
    UNIMPLEMENTED();
    ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES);
}

angle::Result CLCommandQueueVk::enqueueFillImage(const cl::Image &image,
                                                 const void *fillColor,
                                                 const cl::MemOffsets &origin,
                                                 const cl::Coordinate &region,
                                                 const cl::EventPtrs &waitEvents,
                                                 CLEventImpl::CreateFunc *eventCreateFunc)
{
    UNIMPLEMENTED();
    ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES);
}

angle::Result CLCommandQueueVk::enqueueCopyImageToBuffer(const cl::Image &srcImage,
                                                         const cl::Buffer &dstBuffer,
                                                         const cl::MemOffsets &srcOrigin,
                                                         const cl::Coordinate &region,
                                                         size_t dstOffset,
                                                         const cl::EventPtrs &waitEvents,
                                                         CLEventImpl::CreateFunc *eventCreateFunc)
{
    UNIMPLEMENTED();
    ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES);
}

angle::Result CLCommandQueueVk::enqueueCopyBufferToImage(const cl::Buffer &srcBuffer,
                                                         const cl::Image &dstImage,
                                                         size_t srcOffset,
                                                         const cl::MemOffsets &dstOrigin,
                                                         const cl::Coordinate &region,
                                                         const cl::EventPtrs &waitEvents,
                                                         CLEventImpl::CreateFunc *eventCreateFunc)
{
    UNIMPLEMENTED();
    ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES);
}

angle::Result CLCommandQueueVk::enqueueMapImage(const cl::Image &image,
                                                bool blocking,
                                                cl::MapFlags mapFlags,
                                                const cl::MemOffsets &origin,
                                                const cl::Coordinate &region,
                                                size_t *imageRowPitch,
                                                size_t *imageSlicePitch,
                                                const cl::EventPtrs &waitEvents,
                                                CLEventImpl::CreateFunc *eventCreateFunc,
                                                void *&mapPtr)
{
    UNIMPLEMENTED();
    ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES);
}

angle::Result CLCommandQueueVk::enqueueUnmapMemObject(const cl::Memory &memory,
                                                      void *mappedPtr,
                                                      const cl::EventPtrs &waitEvents,
                                                      CLEventImpl::CreateFunc *eventCreateFunc)
{
    UNIMPLEMENTED();
    ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES);
}

angle::Result CLCommandQueueVk::enqueueMigrateMemObjects(const cl::MemoryPtrs &memObjects,
                                                         cl::MemMigrationFlags flags,
                                                         const cl::EventPtrs &waitEvents,
                                                         CLEventImpl::CreateFunc *eventCreateFunc)
{
    UNIMPLEMENTED();
    ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES);
}

angle::Result CLCommandQueueVk::enqueueNDRangeKernel(const cl::Kernel &kernel,
                                                     const cl::NDRange &ndrange,
                                                     const cl::EventPtrs &waitEvents,
                                                     CLEventImpl::CreateFunc *eventCreateFunc)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);

    ANGLE_TRY(processWaitlist(waitEvents));

    cl::WorkgroupCount workgroupCount;
    vk::PipelineCacheAccess pipelineCache;
    vk::PipelineHelper *pipelineHelper = nullptr;
    CLKernelVk &kernelImpl             = kernel.getImpl<CLKernelVk>();

    ANGLE_TRY(processKernelResources(kernelImpl, ndrange));

    // Fetch or create compute pipeline (if we miss in cache)
    ANGLE_CL_IMPL_TRY_ERROR(mContext->getRenderer()->getPipelineCache(mContext, &pipelineCache),
                            CL_OUT_OF_RESOURCES);
    ANGLE_TRY(kernelImpl.getOrCreateComputePipeline(
        &pipelineCache, ndrange, mCommandQueue.getDevice(), &pipelineHelper, &workgroupCount));

    mComputePassCommands->retainResource(pipelineHelper);
    mComputePassCommands->getCommandBuffer().bindComputePipeline(pipelineHelper->getPipeline());
    mComputePassCommands->getCommandBuffer().dispatch(workgroupCount[0], workgroupCount[1],
                                                      workgroupCount[2]);

    ANGLE_TRY(createEvent(eventCreateFunc));

    return angle::Result::Continue;
}

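// Per the OpenCL spec, clEnqueueTask is equivalent to clEnqueueNDRangeKernel with work_dim = 1,
// global_work_offset = NULL, global_work_size[0] = 1, and local_work_size[0] = 1.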
angle::Result CLCommandQueueVk::enqueueTask(const cl::Kernel &kernel,
                                            const cl::EventPtrs &waitEvents,
                                            CLEventImpl::CreateFunc *eventCreateFunc)
{
    constexpr size_t globalWorkSize[3] = {1, 0, 0};
    constexpr size_t localWorkSize[3]  = {1, 0, 0};
    cl::NDRange ndrange(1, nullptr, globalWorkSize, localWorkSize);
    return enqueueNDRangeKernel(kernel, ndrange, waitEvents, eventCreateFunc);
}

angle::Result CLCommandQueueVk::enqueueNativeKernel(cl::UserFunc userFunc,
                                                    void *args,
                                                    size_t cbArgs,
                                                    const cl::BufferPtrs &buffers,
                                                    const std::vector<size_t> bufferPtrOffsets,
                                                    const cl::EventPtrs &waitEvents,
                                                    CLEventImpl::CreateFunc *eventCreateFunc)
{
    UNIMPLEMENTED();
    ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES);
}

angle::Result CLCommandQueueVk::enqueueMarkerWithWaitList(const cl::EventPtrs &waitEvents,
                                                          CLEventImpl::CreateFunc *eventCreateFunc)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);

    ANGLE_TRY(processWaitlist(waitEvents));
    ANGLE_TRY(createEvent(eventCreateFunc));

    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::enqueueMarker(CLEventImpl::CreateFunc &eventCreateFunc)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);

    // This deprecated API is essentially a superset of clEnqueueBarrier: we insert the same
    // barrier and additionally return an event object (the marker), which clEnqueueBarrier
    // does not provide
    VkMemoryBarrier memoryBarrier = {VK_STRUCTURE_TYPE_MEMORY_BARRIER, nullptr,
                                     VK_ACCESS_SHADER_WRITE_BIT,
                                     VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT};
    mComputePassCommands->getCommandBuffer().pipelineBarrier(
        VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1,
        &memoryBarrier, 0, nullptr, 0, nullptr);

    ANGLE_TRY(createEvent(&eventCreateFunc));

    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::enqueueWaitForEvents(const cl::EventPtrs &events)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);

    // Unlike clWaitForEvents, this routine is non-blocking
    ANGLE_TRY(processWaitlist(events));

    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::enqueueBarrierWithWaitList(const cl::EventPtrs &waitEvents,
                                                           CLEventImpl::CreateFunc *eventCreateFunc)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);

    // The barrier command either waits for a list of events to complete, or if the list is
    // empty it waits for all commands previously enqueued in command_queue to complete before
    // it completes
    if (waitEvents.empty())
    {
        VkMemoryBarrier memoryBarrier = {VK_STRUCTURE_TYPE_MEMORY_BARRIER, nullptr,
                                         VK_ACCESS_SHADER_WRITE_BIT,
                                         VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT};
        mComputePassCommands->getCommandBuffer().pipelineBarrier(
            VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1,
            &memoryBarrier, 0, nullptr, 0, nullptr);
    }
    else
    {
        ANGLE_TRY(processWaitlist(waitEvents));
    }

    ANGLE_TRY(createEvent(eventCreateFunc));

    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::enqueueBarrier()
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);

    VkMemoryBarrier memoryBarrier = {VK_STRUCTURE_TYPE_MEMORY_BARRIER, nullptr,
                                     VK_ACCESS_SHADER_WRITE_BIT,
                                     VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT};
    mComputePassCommands->getCommandBuffer().pipelineBarrier(
        VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1,
        &memoryBarrier, 0, nullptr, 0, nullptr);

    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::flush()
{
    ANGLE_TRACE_EVENT0("gpu.angle", "CLCommandQueueVk::flush");

    // Non-blocking finish
    // TODO: Ideally we should find a better implementation that avoids spawning a
    // submit thread/task here. https://anglebug.com/8669
    std::shared_ptr<angle::WaitableEvent> asyncEvent =
        getPlatform()->postMultiThreadWorkerTask(std::make_shared<CLAsyncFinishTask>(this));
    ASSERT(asyncEvent != nullptr);

    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::finish()
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);

    ANGLE_TRACE_EVENT0("gpu.angle", "CLCommandQueueVk::finish");

    // Blocking finish
    return finishInternal();
}

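// Copy the contents of each staged transfer buffer back to its associated user host pointer.
// These buffers are created with CL_MEM_USE_HOST_PTR by the non-blocking enqueueReadBuffer
// path and are drained here once the GPU has finished.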
angle::Result CLCommandQueueVk::syncHostBuffers()
{
    for (const cl::MemoryPtr &memoryPtr : mHostBufferUpdateList)
    {
        ASSERT(memoryPtr->getHostPtr() != nullptr);
        CLBufferVk &bufferVk = memoryPtr->getImpl<CLBufferVk>();
        ANGLE_TRY(
            bufferVk.copyTo(memoryPtr->getHostPtr(), memoryPtr->getOffset(), memoryPtr->getSize()));
    }
    mHostBufferUpdateList.clear();

    return angle::Result::Continue;
}

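// Prepares all kernel resources for the upcoming dispatch: allocates a descriptor set, pushes
// the clspv-reflected global offset/size push constants, writes buffer descriptors for each
// argument, and inserts an execution barrier when a read-after-write hazard is detected on a
// previously used resource.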
angle::Result CLCommandQueueVk::processKernelResources(CLKernelVk &kernelVk,
                                                       const cl::NDRange &ndrange)
{
    bool needsBarrier = false;
    UpdateDescriptorSetsBuilder updateDescriptorSetsBuilder;
    const CLProgramVk::DeviceProgramData *devProgramData =
        kernelVk.getProgram()->getDeviceProgramData(mCommandQueue.getDevice().getNative());
    ASSERT(devProgramData != nullptr);

    // Allocate descriptor set
    VkDescriptorSet descriptorSet{VK_NULL_HANDLE};
    ANGLE_TRY(kernelVk.getProgram()->allocateDescriptorSet(
        kernelVk.getDescriptorSetLayouts()[DescriptorSetIndex::ShaderResource].get(),
        &descriptorSet));

    // Push global offset data
    const VkPushConstantRange *globalOffsetRange = devProgramData->getGlobalOffsetRange();
    if (globalOffsetRange != nullptr)
    {
        mComputePassCommands->getCommandBuffer().pushConstants(
            kernelVk.getPipelineLayout().get(), VK_SHADER_STAGE_COMPUTE_BIT,
            globalOffsetRange->offset, globalOffsetRange->size, ndrange.globalWorkOffset.data());
    }

    // Push global size data
    const VkPushConstantRange *globalSizeRange = devProgramData->getGlobalSizeRange();
    if (globalSizeRange != nullptr)
    {
        mComputePassCommands->getCommandBuffer().pushConstants(
            kernelVk.getPipelineLayout().get(), VK_SHADER_STAGE_COMPUTE_BIT,
            globalSizeRange->offset, globalSizeRange->size, ndrange.globalWorkSize.data());
    }

    // Process each kernel argument/resource
    for (const auto &arg : kernelVk.getArgs())
    {
        switch (arg.type)
        {
            case NonSemanticClspvReflectionArgumentUniform:
            case NonSemanticClspvReflectionArgumentStorageBuffer:
            {
                cl::Memory *clMem = cl::Buffer::Cast(*static_cast<const cl_mem *>(arg.handle));
                CLBufferVk &vkMem = clMem->getImpl<CLBufferVk>();

                // Retain this resource until its associated dispatch completes
                mMemoryCaptures.emplace_back(clMem);

                // Handle possible resource RAW hazard
                if (arg.type != NonSemanticClspvReflectionArgumentUniform)
                {
                    if (mDependencyTracker.contains(clMem) ||
                        mDependencyTracker.size() == kMaxDependencyTrackerSize)
                    {
                        needsBarrier = true;
                        mDependencyTracker.clear();
                    }
                    mDependencyTracker.insert(clMem);
                }

                // Update buffer/descriptor info
                VkDescriptorBufferInfo &bufferInfo =
                    updateDescriptorSetsBuilder.allocDescriptorBufferInfo();
                bufferInfo.range  = clMem->getSize();
                bufferInfo.offset = clMem->getOffset();
                bufferInfo.buffer = vkMem.isSubBuffer()
                                        ? vkMem.getParent()->getBuffer().getBuffer().getHandle()
                                        : vkMem.getBuffer().getBuffer().getHandle();

                VkWriteDescriptorSet &writeDescriptorSet =
                    updateDescriptorSetsBuilder.allocWriteDescriptorSet();
                writeDescriptorSet.descriptorCount = 1;
                writeDescriptorSet.descriptorType =
                    arg.type == NonSemanticClspvReflectionArgumentUniform
                        ? VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER
                        : VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
                writeDescriptorSet.pBufferInfo = &bufferInfo;
                writeDescriptorSet.sType       = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
                writeDescriptorSet.dstSet      = descriptorSet;
                writeDescriptorSet.dstBinding  = arg.descriptorBinding;
                break;
            }
            case NonSemanticClspvReflectionArgumentPodPushConstant:
            {
                mComputePassCommands->getCommandBuffer().pushConstants(
                    kernelVk.getPipelineLayout().get(), VK_SHADER_STAGE_COMPUTE_BIT,
                    arg.pushConstOffset, arg.pushConstantSize, arg.handle);
                break;
            }
            case NonSemanticClspvReflectionArgumentSampler:
            case NonSemanticClspvReflectionArgumentPodUniform:
            case NonSemanticClspvReflectionArgumentStorageImage:
            case NonSemanticClspvReflectionArgumentSampledImage:
            case NonSemanticClspvReflectionArgumentPointerUniform:
            case NonSemanticClspvReflectionArgumentPodStorageBuffer:
            case NonSemanticClspvReflectionArgumentUniformTexelBuffer:
            case NonSemanticClspvReflectionArgumentStorageTexelBuffer:
            case NonSemanticClspvReflectionArgumentPointerPushConstant:
            default:
            {
                UNIMPLEMENTED();
                break;
            }
        }
    }

    if (needsBarrier)
    {
        VkMemoryBarrier memoryBarrier = {VK_STRUCTURE_TYPE_MEMORY_BARRIER, nullptr,
                                         VK_ACCESS_SHADER_WRITE_BIT,
                                         VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT};
        mComputePassCommands->getCommandBuffer().pipelineBarrier(
            VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1,
            &memoryBarrier, 0, nullptr, 0, nullptr);
    }

    mContext->getPerfCounters().writeDescriptorSets =
        updateDescriptorSetsBuilder.flushDescriptorSetUpdates(mContext->getRenderer()->getDevice());

    mComputePassCommands->getCommandBuffer().bindDescriptorSets(
        kernelVk.getPipelineLayout().get(), VK_PIPELINE_BIND_POINT_COMPUTE,
        DescriptorSetIndex::Internal, 1, &descriptorSet, 0, nullptr);

    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::flushComputePassCommands()
{
    mLastFlushedQueueSerial = mComputePassCommands->getQueueSerial();

    // Flush our compute pass commands to the renderer's primary command buffer
    ANGLE_TRY(mContext->getRenderer()->flushOutsideRPCommands(
        mContext, getProtectionType(), egl::ContextPriority::Medium, &mComputePassCommands));

    mHasAnyCommandsPendingSubmission = true;

    mContext->getPerfCounters().flushedOutsideRenderPassCommandBuffers++;

    // Generate a new serial for the next batch of commands
    mComputePassCommands->setQueueSerial(
        mCurrentQueueSerialIndex,
        mContext->getRenderer()->generateQueueSerial(mCurrentQueueSerialIndex));

    return angle::Result::Continue;
}

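// Resolves an event wait list. Events that belong to this queue are ordered with a single
// pipeline barrier; user events and events from other queues cannot be expressed as barriers,
// so they are deferred until submission time (see finishInternal).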
angle::Result CLCommandQueueVk::processWaitlist(const cl::EventPtrs &waitEvents)
{
    if (!waitEvents.empty())
    {
        bool insertedBarrier = false;
        for (const cl::EventPtr &event : waitEvents)
        {
            if (event->getImpl<CLEventVk>().isUserEvent() ||
                event->getCommandQueue() != &mCommandQueue)
            {
                // We cannot use a barrier in these cases, so defer the event handling until
                // submission time
                // TODO: Perhaps we could utilize VkEvents here instead and have GPU wait(s)
                // https://anglebug.com/8670
                mDependantEvents.push_back(event);
            }
            else if (event->getCommandQueue() == &mCommandQueue && !insertedBarrier)
            {
                // As long as there is at least one dependent command in the same queue, we
                // only need to insert a single execution barrier
                VkMemoryBarrier memoryBarrier = {
                    VK_STRUCTURE_TYPE_MEMORY_BARRIER, nullptr, VK_ACCESS_SHADER_WRITE_BIT,
                    VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT};
                mComputePassCommands->getCommandBuffer().pipelineBarrier(
                    VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0,
                    1, &memoryBarrier, 0, nullptr, 0, nullptr);

                insertedBarrier = true;
            }
        }
    }
    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::submitCommands()
{
    ANGLE_TRACE_EVENT0("gpu.angle", "CLCommandQueueVk::submitCommands()");

    // Kick off renderer submit
    ANGLE_TRY(mContext->getRenderer()->submitCommands(mContext, getProtectionType(),
                                                      egl::ContextPriority::Medium, nullptr,
                                                      nullptr, mLastFlushedQueueSerial));

    mLastSubmittedQueueSerial = mLastFlushedQueueSerial;

    // Now that we have submitted commands, some of the pending garbage may no longer be
    // pending and should be moved to the garbage list.
    mContext->getRenderer()->cleanupPendingSubmissionGarbage();

    mHasAnyCommandsPendingSubmission = false;

    return angle::Result::Continue;
}

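// When the caller requested an event, install a creation callback that tags the new CLEventVk
// with the current command batch's queue serial so that its status tracks that batch.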
angle::Result CLCommandQueueVk::createEvent(CLEventImpl::CreateFunc *createFunc)
{
    if (createFunc != nullptr)
    {
        *createFunc = [this](const cl::Event &event) {
            auto eventVk = new (std::nothrow) CLEventVk(event);
            if (eventVk == nullptr)
            {
                ERR() << "Failed to create event obj!";
                ANGLE_CL_SET_ERROR(CL_OUT_OF_HOST_MEMORY);
                return CLEventImpl::Ptr(nullptr);
            }
            eventVk->setQueueSerial(mComputePassCommands->getQueueSerial());

            // Save a reference to this event
            mAssociatedEvents.push_back(cl::EventPtr{&eventVk->getFrontendObject()});

            return CLEventImpl::Ptr(eventVk);
        };
    }
    return angle::Result::Continue;
}

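// Blocking finish: resolves any deferred dependent events (waiting on user events and finishing
// other queues), flushes and submits the recorded compute pass commands, waits for the
// submission to complete, syncs staged host buffers, and advances the associated events through
// CL_SUBMITTED / CL_RUNNING / CL_COMPLETE.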
angle::Result CLCommandQueueVk::finishInternal()
{
    for (cl::EventPtr event : mAssociatedEvents)
    {
        ANGLE_TRY(event->getImpl<CLEventVk>().setStatusAndExecuteCallback(CL_SUBMITTED));
    }

    if (!mComputePassCommands->empty())
    {
        // If we still have dependent events, handle them now
        if (!mDependantEvents.empty())
        {
            for (const auto &depEvent : mDependantEvents)
            {
                if (depEvent->getImpl<CLEventVk>().isUserEvent())
                {
                    // Wait here for the user to set the event status
                    cl_int status = CL_QUEUED;
                    ANGLE_TRY(depEvent->getImpl<CLEventVk>().waitForUserEventStatus());
                    ANGLE_TRY(depEvent->getImpl<CLEventVk>().getCommandExecutionStatus(status));
                    if (status < 0)
                    {
                        ERR() << "Invalid dependent user-event (" << depEvent.get()
                              << ") status encountered!";
                        mComputePassCommands->getCommandBuffer().reset();
                        ANGLE_CL_RETURN_ERROR(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
                    }
                }
                else
                {
                    // Otherwise, we just need to submit/finish the dependent event queues
                    // that are not associated with this queue
                    ANGLE_TRY(depEvent->getCommandQueue()->finish());
                }
            }
            mDependantEvents.clear();
        }

        ANGLE_TRY(flushComputePassCommands());
    }

    for (cl::EventPtr event : mAssociatedEvents)
    {
        ANGLE_TRY(event->getImpl<CLEventVk>().setStatusAndExecuteCallback(CL_RUNNING));
    }

    if (mHasAnyCommandsPendingSubmission)
    {
        // Submit and wait for fence
        ANGLE_TRY(submitCommands());
        ANGLE_TRY(mContext->getRenderer()->finishQueueSerial(mContext, mLastSubmittedQueueSerial));

        // Ensure any resources are synced back to the host on GPU completion
        ANGLE_TRY(syncHostBuffers());
    }

    for (cl::EventPtr event : mAssociatedEvents)
    {
        ANGLE_TRY(event->getImpl<CLEventVk>().setStatusAndExecuteCallback(CL_COMPLETE));
    }

    mMemoryCaptures.clear();
    mAssociatedEvents.clear();
    mDependencyTracker.clear();

    return angle::Result::Continue;
}

}  // namespace rx