1 //
2 // Copyright 2021 The ANGLE Project Authors. All rights reserved.
3 // Use of this source code is governed by a BSD-style license that can be
4 // found in the LICENSE file.
5 //
6 // CLKernelVk.cpp: Implements the class methods for CLKernelVk.
7
8 #include "common/PackedEnums.h"
9
10 #include "libANGLE/renderer/vulkan/CLContextVk.h"
11 #include "libANGLE/renderer/vulkan/CLDeviceVk.h"
12 #include "libANGLE/renderer/vulkan/CLKernelVk.h"
13 #include "libANGLE/renderer/vulkan/CLMemoryVk.h"
14 #include "libANGLE/renderer/vulkan/CLProgramVk.h"
15 #include "libANGLE/renderer/vulkan/vk_wrapper.h"
16
17 #include "libANGLE/CLBuffer.h"
18 #include "libANGLE/CLContext.h"
19 #include "libANGLE/CLKernel.h"
20 #include "libANGLE/CLProgram.h"
21 #include "libANGLE/cl_utils.h"
22 #include "spirv/unified1/NonSemanticClspvReflection.h"
23
24 namespace rx
25 {
26
CLKernelVk(const cl::Kernel & kernel,std::string & name,std::string & attributes,CLKernelArguments & args)27 CLKernelVk::CLKernelVk(const cl::Kernel &kernel,
28 std::string &name,
29 std::string &attributes,
30 CLKernelArguments &args)
31 : CLKernelImpl(kernel),
32 mProgram(&kernel.getProgram().getImpl<CLProgramVk>()),
33 mContext(&kernel.getProgram().getContext().getImpl<CLContextVk>()),
34 mName(name),
35 mAttributes(attributes),
36 mArgs(args),
37 mPodBuffer(nullptr)
38 {
39 mShaderProgramHelper.setShader(gl::ShaderType::Compute,
40 mKernel.getProgram().getImpl<CLProgramVk>().getShaderModule());
41 }
42
// Destructor: tears down the Vulkan-side pipeline/shader objects first, then
// drops the extra reference held on the POD argument buffer (if one was made).
CLKernelVk::~CLKernelVk()
{
    mComputePipelineCache.destroy(mContext);
    mShaderProgramHelper.destroy(mContext->getRenderer());

    if (mPodBuffer)
    {
        // mPodBuffer assignment will make newly created buffer
        // return refcount of 2, so need to release by 1
        mPodBuffer->release();
    }
}
55
// Builds this kernel's descriptor set layout descriptions, push constant range,
// and (optionally) a POD-argument backing buffer from the clspv reflection data,
// then assembles the pipeline layout description and initializes the descriptor
// pools. Returns Continue on success.
angle::Result CLKernelVk::init()
{
    const CLProgramVk::DeviceProgramData *deviceProgramData =
        mProgram->getDeviceProgramData(mName.c_str());

    // Literal sampler handling
    for (const ClspvLiteralSampler &literalSampler :
         deviceProgramData->reflectionData.literalSamplers)
    {
        mDescriptorSetLayoutDescs[DescriptorSetIndex::LiteralSampler].addBinding(
            literalSampler.binding, VK_DESCRIPTOR_TYPE_SAMPLER, 1, VK_SHADER_STAGE_COMPUTE_BIT,
            nullptr);
    }

    vk::DescriptorSetLayoutDesc &descriptorSetLayoutDesc =
        mDescriptorSetLayoutDescs[DescriptorSetIndex::KernelArguments];
    VkPushConstantRange pcRange = deviceProgramData->pushConstRange;
    size_t podBufferSize = 0;

    // POD (plain-old-data) arguments share one backing buffer and therefore one
    // descriptor binding: only the first POD argument seen adds a binding
    // (tracked by podFound); every POD argument still grows podBufferSize.
    bool podFound = false;
    for (const auto &arg : getArgs())
    {
        VkDescriptorType descType = VK_DESCRIPTOR_TYPE_MAX_ENUM;
        switch (arg.type)
        {
            case NonSemanticClspvReflectionArgumentStorageBuffer:
                descType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
                break;
            case NonSemanticClspvReflectionArgumentUniform:
            case NonSemanticClspvReflectionArgumentPointerUniform:
                descType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER;
                break;
            case NonSemanticClspvReflectionArgumentPodUniform:
            case NonSemanticClspvReflectionArgumentPodStorageBuffer:
            {
                // Track the high-water mark (offset + size) the POD buffer must cover.
                uint32_t newPodBufferSize = arg.podStorageBufferOffset + arg.podStorageBufferSize;
                podBufferSize = newPodBufferSize > podBufferSize ? newPodBufferSize : podBufferSize;
                if (podFound)
                {
                    continue;
                }
                descType = arg.type == NonSemanticClspvReflectionArgumentPodUniform
                               ? VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER
                               : VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
                podFound = true;
                break;
            }
            case NonSemanticClspvReflectionArgumentPodPushConstant:
                // Get existing push constant range and see if we need to update
                if (arg.pushConstOffset + arg.pushConstantSize > pcRange.offset + pcRange.size)
                {
                    pcRange.size = arg.pushConstOffset + arg.pushConstantSize - pcRange.offset;
                }
                continue;
            case NonSemanticClspvReflectionArgumentSampledImage:
                descType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE;
                break;
            case NonSemanticClspvReflectionArgumentStorageImage:
                descType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE;
                break;
            case NonSemanticClspvReflectionArgumentSampler:
                descType = VK_DESCRIPTOR_TYPE_SAMPLER;
                break;
            case NonSemanticClspvReflectionArgumentStorageTexelBuffer:
                descType = VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER;
                break;
            case NonSemanticClspvReflectionArgumentUniformTexelBuffer:
                descType = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER;
                break;
            default:
                // Argument types with no descriptor (e.g. workgroup/local args).
                continue;
        }
        if (descType != VK_DESCRIPTOR_TYPE_MAX_ENUM)
        {
            descriptorSetLayoutDesc.addBinding(arg.descriptorBinding, descType, 1,
                                               VK_SHADER_STAGE_COMPUTE_BIT, nullptr);
        }
    }

    // Back the POD arguments with a frontend-created buffer sized to the high-water mark.
    if (podBufferSize > 0)
    {
        mPodBuffer =
            cl::MemoryPtr(cl::Buffer::Cast(this->mContext->getFrontendObject().createBuffer(
                nullptr, cl::MemFlags(CL_MEM_READ_ONLY), podBufferSize, nullptr)));
    }

    if (usesPrintf())
    {
        mDescriptorSetLayoutDescs[DescriptorSetIndex::Printf].addBinding(
            deviceProgramData->reflectionData.printfBufferStorage.binding,
            VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr);
    }

    // Get pipeline layout from cache (creates if missed)
    // A given kernel need not have resulted in use of all the descriptor sets. Unless the
    // graphicsPipelineLibrary extension is supported, the pipeline layout needs all the
    // descriptor set layouts to be valid. So set them up in the order of their occurrence.
    mPipelineLayoutDesc = {};
    for (DescriptorSetIndex index : angle::AllEnums<DescriptorSetIndex>())
    {
        if (!mDescriptorSetLayoutDescs[index].empty())
        {
            mPipelineLayoutDesc.updateDescriptorSetLayout(index, mDescriptorSetLayoutDescs[index]);
        }
    }

    // push constant setup
    // push constant size must be multiple of 4
    pcRange.size = roundUpPow2(pcRange.size, 4u);
    mPodArgumentPushConstants.resize(pcRange.size);

    // push constant offset must be multiple of 4, round down to ensure this
    pcRange.offset = roundDownPow2(pcRange.offset, 4u);

    mPipelineLayoutDesc.updatePushConstantRange(pcRange.stageFlags, pcRange.offset, pcRange.size);

    // initialize the descriptor pools
    // descriptor pools are setup as per their indices
    return initializeDescriptorPools();
}
176
// Records the value of kernel argument argIndex for the next enqueue.
// POD push-constant args are copied into mPodArgumentPushConstants; buffer-backed
// POD args are copied into mPodBuffer; __local (workgroup) args become spec
// constants; memory-object args store the cl_mem handle. Unused args are ignored.
angle::Result CLKernelVk::setArg(cl_uint argIndex, size_t argSize, const void *argValue)
{
    auto &arg = mArgs.at(argIndex);
    if (arg.used)
    {
        switch (arg.type)
        {
            case NonSemanticClspvReflectionArgumentPodPushConstant:
                ASSERT(mPodArgumentPushConstants.size() >=
                       arg.pushConstantSize + arg.pushConstOffset);
                arg.handle     = &mPodArgumentPushConstants[arg.pushConstOffset];
                arg.handleSize = argSize;
                if (argSize > 0 && argValue != nullptr)
                {
                    // Copy the contents since app is free to delete/reassign the contents after
                    memcpy(arg.handle, argValue, arg.handleSize);
                }
                break;
            case NonSemanticClspvReflectionArgumentPodUniform:
            case NonSemanticClspvReflectionArgumentPodStorageBuffer:
                // NOTE(review): the ASSERT bounds-checks against podUniformOffset while the
                // copy below writes at podStorageBufferOffset — confirm these two fields
                // alias/agree for these argument types.
                ASSERT(mPodBuffer->getSize() >= argSize + arg.podUniformOffset);
                if (argSize > 0 && argValue != nullptr)
                {
                    ANGLE_TRY(mPodBuffer->getImpl<CLBufferVk>().copyFrom(
                        argValue, arg.podStorageBufferOffset, argSize));
                }
                break;
            case NonSemanticClspvReflectionArgumentWorkgroup:
                // __local argument: record its element count (argSize divided by what is
                // presumably the per-element size) as a specialization constant.
                ASSERT(arg.workgroupSize != 0);
                mSpecConstants.push_back(
                    KernelSpecConstant{.ID = arg.workgroupSpecId,
                                       .data = static_cast<uint32_t>(argSize / arg.workgroupSize)});
                break;
            case NonSemanticClspvReflectionArgumentUniform:
            case NonSemanticClspvReflectionArgumentStorageBuffer:
            case NonSemanticClspvReflectionArgumentStorageImage:
            case NonSemanticClspvReflectionArgumentSampledImage:
            case NonSemanticClspvReflectionArgumentUniformTexelBuffer:
            case NonSemanticClspvReflectionArgumentStorageTexelBuffer:
                // Memory-object argument: argValue points at a cl_mem; store the handle.
                ASSERT(argSize == sizeof(cl_mem *));
                arg.handle     = *static_cast<const cl_mem *>(argValue);
                arg.handleSize = argSize;
                break;
            default:
                // Just store ptr and size (if we end up here)
                arg.handle     = const_cast<void *>(argValue);
                arg.handleSize = argSize;
                break;
        }
    }

    return angle::Result::Continue;
}
230
createInfo(CLKernelImpl::Info * info) const231 angle::Result CLKernelVk::createInfo(CLKernelImpl::Info *info) const
232 {
233 info->functionName = mName;
234 info->attributes = mAttributes;
235 info->numArgs = static_cast<cl_uint>(mArgs.size());
236 for (const auto &arg : mArgs)
237 {
238 ArgInfo argInfo;
239 argInfo.name = arg.info.name;
240 argInfo.typeName = arg.info.typeName;
241 argInfo.accessQualifier = arg.info.accessQualifier;
242 argInfo.addressQualifier = arg.info.addressQualifier;
243 argInfo.typeQualifier = arg.info.typeQualifier;
244 info->args.push_back(std::move(argInfo));
245 }
246
247 auto &ctx = mKernel.getProgram().getContext();
248 info->workGroups.resize(ctx.getDevices().size());
249 const CLProgramVk::DeviceProgramData *deviceProgramData = nullptr;
250 for (auto i = 0u; i < ctx.getDevices().size(); ++i)
251 {
252 auto &workGroup = info->workGroups[i];
253 const auto deviceVk = &ctx.getDevices()[i]->getImpl<CLDeviceVk>();
254 deviceProgramData = mProgram->getDeviceProgramData(ctx.getDevices()[i]->getNative());
255 if (deviceProgramData == nullptr)
256 {
257 continue;
258 }
259
260 // TODO: http://anglebug.com/42267005
261 ANGLE_TRY(
262 deviceVk->getInfoSizeT(cl::DeviceInfo::MaxWorkGroupSize, &workGroup.workGroupSize));
263
264 // TODO: http://anglebug.com/42267004
265 workGroup.privateMemSize = 0;
266 workGroup.localMemSize = 0;
267
268 workGroup.prefWorkGroupSizeMultiple = 16u;
269 workGroup.globalWorkSize = {0, 0, 0};
270 if (deviceProgramData->reflectionData.kernelCompileWorkgroupSize.contains(mName))
271 {
272 workGroup.compileWorkGroupSize = {
273 deviceProgramData->reflectionData.kernelCompileWorkgroupSize.at(mName)[0],
274 deviceProgramData->reflectionData.kernelCompileWorkgroupSize.at(mName)[1],
275 deviceProgramData->reflectionData.kernelCompileWorkgroupSize.at(mName)[2]};
276 }
277 else
278 {
279 workGroup.compileWorkGroupSize = {0, 0, 0};
280 }
281 }
282
283 return angle::Result::Continue;
284 }
285
initPipelineLayout()286 angle::Result CLKernelVk::initPipelineLayout()
287 {
288 PipelineLayoutCache *pipelineLayoutCache = mContext->getPipelineLayoutCache();
289 return pipelineLayoutCache->getPipelineLayout(mContext, mPipelineLayoutDesc,
290 mDescriptorSetLayouts, &mPipelineLayout);
291 }
292
// Returns (creating on cache miss) the compute pipeline for this kernel,
// specialized for the given NDRange on the given device. Program-level spec
// constants (work dimensions, workgroup sizes, global offsets) are packed
// first, followed by kernel-level spec constants collected in setArg().
angle::Result CLKernelVk::getOrCreateComputePipeline(vk::PipelineCacheAccess *pipelineCache,
                                                     const cl::NDRange &ndrange,
                                                     const cl::Device &device,
                                                     vk::PipelineHelper **pipelineOut)
{
    const CLProgramVk::DeviceProgramData *devProgramData =
        getProgram()->getDeviceProgramData(device.getNative());
    ASSERT(devProgramData != nullptr);

    // Populate program specialization constants (if any)
    // Each constant occupies one uint32_t slot; constantDataOffset tracks the
    // byte offset of the next slot and must stay in sync with specConstantData.
    uint32_t constantDataOffset = 0;
    std::vector<uint32_t> specConstantData;
    std::vector<VkSpecializationMapEntry> mapEntries;
    for (const auto specConstantUsed : devProgramData->reflectionData.specConstantsUsed)
    {
        switch (specConstantUsed)
        {
            case SpecConstantType::WorkDimension:
                specConstantData.push_back(ndrange.workDimensions);
                break;
            case SpecConstantType::WorkgroupSizeX:
                specConstantData.push_back(ndrange.localWorkSize[0]);
                break;
            case SpecConstantType::WorkgroupSizeY:
                specConstantData.push_back(ndrange.localWorkSize[1]);
                break;
            case SpecConstantType::WorkgroupSizeZ:
                specConstantData.push_back(ndrange.localWorkSize[2]);
                break;
            case SpecConstantType::GlobalOffsetX:
                specConstantData.push_back(ndrange.globalWorkOffset[0]);
                break;
            case SpecConstantType::GlobalOffsetY:
                specConstantData.push_back(ndrange.globalWorkOffset[1]);
                break;
            case SpecConstantType::GlobalOffsetZ:
                specConstantData.push_back(ndrange.globalWorkOffset[2]);
                break;
            default:
                // Unknown spec constant type: skip without adding a map entry.
                UNIMPLEMENTED();
                continue;
        }
        mapEntries.push_back(VkSpecializationMapEntry{
            .constantID = devProgramData->reflectionData.specConstantIDs[specConstantUsed],
            .offset     = constantDataOffset,
            .size       = sizeof(uint32_t)});
        constantDataOffset += sizeof(uint32_t);
    }
    // Populate kernel specialization constants (if any)
    for (const auto &specConstant : mSpecConstants)
    {
        specConstantData.push_back(specConstant.data);
        mapEntries.push_back(VkSpecializationMapEntry{
            .constantID = specConstant.ID, .offset = constantDataOffset, .size = sizeof(uint32_t)});
        constantDataOffset += sizeof(uint32_t);
    }
    VkSpecializationInfo computeSpecializationInfo{
        .mapEntryCount = static_cast<uint32_t>(mapEntries.size()),
        .pMapEntries   = mapEntries.data(),
        .dataSize      = specConstantData.size() * sizeof(uint32_t),
        .pData         = specConstantData.data(),
    };

    // Now get or create (on compute pipeline cache miss) compute pipeline and return it
    vk::ComputePipelineOptions options = vk::GetComputePipelineOptions(
        vk::PipelineRobustness::NonRobust, vk::PipelineProtectedAccess::Unprotected);
    return mShaderProgramHelper.getOrCreateComputePipeline(
        mContext, &mComputePipelineCache, pipelineCache, getPipelineLayout(), options,
        PipelineSource::Draw, pipelineOut, mName.c_str(), &computeSpecializationInfo);
}
363
usesPrintf() const364 bool CLKernelVk::usesPrintf() const
365 {
366 return mProgram->getDeviceProgramData(mName.c_str())->getKernelFlags(mName) &
367 NonSemanticClspvReflectionMayUsePrintf;
368 }
369
initializeDescriptorPools()370 angle::Result CLKernelVk::initializeDescriptorPools()
371 {
372 for (DescriptorSetIndex index : angle::AllEnums<DescriptorSetIndex>())
373 {
374 if (!mDescriptorSetLayoutDescs[index].empty())
375 {
376 ANGLE_TRY(mContext->getMetaDescriptorPool().bindCachedDescriptorPool(
377 mContext, mDescriptorSetLayoutDescs[index], 1,
378 mContext->getDescriptorSetLayoutCache(), &mDynamicDescriptorPools[index]));
379 }
380 }
381 return angle::Result::Continue;
382 }
383
// Ensures mDescriptorSets[index] holds a descriptor set safe to update: an
// existing set is reused only if the in-flight command buffer is not still
// referencing it (checked via the queue serial); otherwise it is dropped and a
// new one is allocated from the dynamic pool and retained by the command buffer.
angle::Result CLKernelVk::allocateDescriptorSet(
    DescriptorSetIndex index,
    angle::EnumIterator<DescriptorSetIndex> layoutIndex,
    vk::OutsideRenderPassCommandBufferHelper *computePassCommands)
{
    if (mDescriptorSets[index] && mDescriptorSets[index]->valid())
    {
        if (mDescriptorSets[index]->usedByCommandBuffer(computePassCommands->getQueueSerial()))
        {
            // Still referenced by pending GPU work; must not rewrite it in place.
            mDescriptorSets[index].reset();
        }
        else
        {
            return angle::Result::Continue;
        }
    }

    if (mDynamicDescriptorPools[index]->valid())
    {
        ANGLE_TRY(mDynamicDescriptorPools[index]->allocateDescriptorSet(
            mContext, *mDescriptorSetLayouts[*layoutIndex], &mDescriptorSets[index]));
        // Keep the set alive for as long as the command buffer may use it.
        computePassCommands->retainResource(mDescriptorSets[index].get());
    }

    return angle::Result::Continue;
}
410 } // namespace rx
411