// // Copyright 2021 The ANGLE Project Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. // // CLKernelVk.cpp: Implements the class methods for CLKernelVk. #include "common/PackedEnums.h" #include "libANGLE/renderer/vulkan/CLContextVk.h" #include "libANGLE/renderer/vulkan/CLDeviceVk.h" #include "libANGLE/renderer/vulkan/CLKernelVk.h" #include "libANGLE/renderer/vulkan/CLMemoryVk.h" #include "libANGLE/renderer/vulkan/CLProgramVk.h" #include "libANGLE/renderer/vulkan/vk_wrapper.h" #include "libANGLE/CLBuffer.h" #include "libANGLE/CLContext.h" #include "libANGLE/CLKernel.h" #include "libANGLE/CLProgram.h" #include "libANGLE/cl_utils.h" #include "spirv/unified1/NonSemanticClspvReflection.h" namespace rx { CLKernelVk::CLKernelVk(const cl::Kernel &kernel, std::string &name, std::string &attributes, CLKernelArguments &args) : CLKernelImpl(kernel), mProgram(&kernel.getProgram().getImpl()), mContext(&kernel.getProgram().getContext().getImpl()), mName(name), mAttributes(attributes), mArgs(args), mPodBuffer(nullptr) { mShaderProgramHelper.setShader(gl::ShaderType::Compute, mKernel.getProgram().getImpl().getShaderModule()); } CLKernelVk::~CLKernelVk() { mComputePipelineCache.destroy(mContext); mShaderProgramHelper.destroy(mContext->getRenderer()); if (mPodBuffer) { // mPodBuffer assignment will make newly created buffer // return refcount of 2, so need to release by 1 mPodBuffer->release(); } } angle::Result CLKernelVk::init() { const CLProgramVk::DeviceProgramData *deviceProgramData = mProgram->getDeviceProgramData(mName.c_str()); // Literal sampler handling for (const ClspvLiteralSampler &literalSampler : deviceProgramData->reflectionData.literalSamplers) { mDescriptorSetLayoutDescs[DescriptorSetIndex::LiteralSampler].addBinding( literalSampler.binding, VK_DESCRIPTOR_TYPE_SAMPLER, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr); } vk::DescriptorSetLayoutDesc &descriptorSetLayoutDesc = mDescriptorSetLayoutDescs[DescriptorSetIndex::KernelArguments]; VkPushConstantRange pcRange = deviceProgramData->pushConstRange; size_t podBufferSize = 0; bool podFound = false; for (const auto &arg : getArgs()) { VkDescriptorType descType = VK_DESCRIPTOR_TYPE_MAX_ENUM; switch (arg.type) { case NonSemanticClspvReflectionArgumentStorageBuffer: descType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; break; case NonSemanticClspvReflectionArgumentUniform: case NonSemanticClspvReflectionArgumentPointerUniform: descType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; break; case NonSemanticClspvReflectionArgumentPodUniform: case NonSemanticClspvReflectionArgumentPodStorageBuffer: { uint32_t newPodBufferSize = arg.podStorageBufferOffset + arg.podStorageBufferSize; podBufferSize = newPodBufferSize > podBufferSize ? newPodBufferSize : podBufferSize; if (podFound) { continue; } descType = arg.type == NonSemanticClspvReflectionArgumentPodUniform ? VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER : VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; podFound = true; break; } case NonSemanticClspvReflectionArgumentPodPushConstant: // Get existing push constant range and see if we need to update if (arg.pushConstOffset + arg.pushConstantSize > pcRange.offset + pcRange.size) { pcRange.size = arg.pushConstOffset + arg.pushConstantSize - pcRange.offset; } continue; case NonSemanticClspvReflectionArgumentSampledImage: descType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE; break; case NonSemanticClspvReflectionArgumentStorageImage: descType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; break; case NonSemanticClspvReflectionArgumentSampler: descType = VK_DESCRIPTOR_TYPE_SAMPLER; break; case NonSemanticClspvReflectionArgumentStorageTexelBuffer: descType = VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER; break; case NonSemanticClspvReflectionArgumentUniformTexelBuffer: descType = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER; break; default: continue; } if (descType != VK_DESCRIPTOR_TYPE_MAX_ENUM) { descriptorSetLayoutDesc.addBinding(arg.descriptorBinding, descType, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr); } } if (podBufferSize > 0) { mPodBuffer = cl::MemoryPtr(cl::Buffer::Cast(this->mContext->getFrontendObject().createBuffer( nullptr, cl::MemFlags(CL_MEM_READ_ONLY), podBufferSize, nullptr))); } if (usesPrintf()) { mDescriptorSetLayoutDescs[DescriptorSetIndex::Printf].addBinding( deviceProgramData->reflectionData.printfBufferStorage.binding, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr); } // Get pipeline layout from cache (creates if missed) // A given kernel need not have resulted in use of all the descriptor sets. Unless the // graphicsPipelineLibrary extension is supported, the pipeline layout need all the descriptor // set layouts to be valide. So set them up in the order of their occurrence. mPipelineLayoutDesc = {}; for (DescriptorSetIndex index : angle::AllEnums()) { if (!mDescriptorSetLayoutDescs[index].empty()) { mPipelineLayoutDesc.updateDescriptorSetLayout(index, mDescriptorSetLayoutDescs[index]); } } // push constant setup // push constant size must be multiple of 4 pcRange.size = roundUpPow2(pcRange.size, 4u); mPodArgumentPushConstants.resize(pcRange.size); // push constant offset must be multiple of 4, round down to ensure this pcRange.offset = roundDownPow2(pcRange.offset, 4u); mPipelineLayoutDesc.updatePushConstantRange(pcRange.stageFlags, pcRange.offset, pcRange.size); // initialize the descriptor pools // descriptor pools are setup as per their indices return initializeDescriptorPools(); } angle::Result CLKernelVk::setArg(cl_uint argIndex, size_t argSize, const void *argValue) { auto &arg = mArgs.at(argIndex); if (arg.used) { switch (arg.type) { case NonSemanticClspvReflectionArgumentPodPushConstant: ASSERT(mPodArgumentPushConstants.size() >= arg.pushConstantSize + arg.pushConstOffset); arg.handle = &mPodArgumentPushConstants[arg.pushConstOffset]; arg.handleSize = argSize; if (argSize > 0 && argValue != nullptr) { // Copy the contents since app is free to delete/reassign the contents after memcpy(arg.handle, argValue, arg.handleSize); } break; case NonSemanticClspvReflectionArgumentPodUniform: case NonSemanticClspvReflectionArgumentPodStorageBuffer: ASSERT(mPodBuffer->getSize() >= argSize + arg.podUniformOffset); if (argSize > 0 && argValue != nullptr) { ANGLE_TRY(mPodBuffer->getImpl().copyFrom( argValue, arg.podStorageBufferOffset, argSize)); } break; case NonSemanticClspvReflectionArgumentWorkgroup: ASSERT(arg.workgroupSize != 0); mSpecConstants.push_back( KernelSpecConstant{.ID = arg.workgroupSpecId, .data = static_cast(argSize / arg.workgroupSize)}); break; case NonSemanticClspvReflectionArgumentUniform: case NonSemanticClspvReflectionArgumentStorageBuffer: case NonSemanticClspvReflectionArgumentStorageImage: case NonSemanticClspvReflectionArgumentSampledImage: case NonSemanticClspvReflectionArgumentUniformTexelBuffer: case NonSemanticClspvReflectionArgumentStorageTexelBuffer: ASSERT(argSize == sizeof(cl_mem *)); arg.handle = *static_cast(argValue); arg.handleSize = argSize; break; default: // Just store ptr and size (if we end up here) arg.handle = const_cast(argValue); arg.handleSize = argSize; break; } } return angle::Result::Continue; } angle::Result CLKernelVk::createInfo(CLKernelImpl::Info *info) const { info->functionName = mName; info->attributes = mAttributes; info->numArgs = static_cast(mArgs.size()); for (const auto &arg : mArgs) { ArgInfo argInfo; argInfo.name = arg.info.name; argInfo.typeName = arg.info.typeName; argInfo.accessQualifier = arg.info.accessQualifier; argInfo.addressQualifier = arg.info.addressQualifier; argInfo.typeQualifier = arg.info.typeQualifier; info->args.push_back(std::move(argInfo)); } auto &ctx = mKernel.getProgram().getContext(); info->workGroups.resize(ctx.getDevices().size()); const CLProgramVk::DeviceProgramData *deviceProgramData = nullptr; for (auto i = 0u; i < ctx.getDevices().size(); ++i) { auto &workGroup = info->workGroups[i]; const auto deviceVk = &ctx.getDevices()[i]->getImpl(); deviceProgramData = mProgram->getDeviceProgramData(ctx.getDevices()[i]->getNative()); if (deviceProgramData == nullptr) { continue; } // TODO: http://anglebug.com/42267005 ANGLE_TRY( deviceVk->getInfoSizeT(cl::DeviceInfo::MaxWorkGroupSize, &workGroup.workGroupSize)); // TODO: http://anglebug.com/42267004 workGroup.privateMemSize = 0; workGroup.localMemSize = 0; workGroup.prefWorkGroupSizeMultiple = 16u; workGroup.globalWorkSize = {0, 0, 0}; if (deviceProgramData->reflectionData.kernelCompileWorkgroupSize.contains(mName)) { workGroup.compileWorkGroupSize = { deviceProgramData->reflectionData.kernelCompileWorkgroupSize.at(mName)[0], deviceProgramData->reflectionData.kernelCompileWorkgroupSize.at(mName)[1], deviceProgramData->reflectionData.kernelCompileWorkgroupSize.at(mName)[2]}; } else { workGroup.compileWorkGroupSize = {0, 0, 0}; } } return angle::Result::Continue; } angle::Result CLKernelVk::initPipelineLayout() { PipelineLayoutCache *pipelineLayoutCache = mContext->getPipelineLayoutCache(); return pipelineLayoutCache->getPipelineLayout(mContext, mPipelineLayoutDesc, mDescriptorSetLayouts, &mPipelineLayout); } angle::Result CLKernelVk::getOrCreateComputePipeline(vk::PipelineCacheAccess *pipelineCache, const cl::NDRange &ndrange, const cl::Device &device, vk::PipelineHelper **pipelineOut) { const CLProgramVk::DeviceProgramData *devProgramData = getProgram()->getDeviceProgramData(device.getNative()); ASSERT(devProgramData != nullptr); // Populate program specialization constants (if any) uint32_t constantDataOffset = 0; std::vector specConstantData; std::vector mapEntries; for (const auto specConstantUsed : devProgramData->reflectionData.specConstantsUsed) { switch (specConstantUsed) { case SpecConstantType::WorkDimension: specConstantData.push_back(ndrange.workDimensions); break; case SpecConstantType::WorkgroupSizeX: specConstantData.push_back(ndrange.localWorkSize[0]); break; case SpecConstantType::WorkgroupSizeY: specConstantData.push_back(ndrange.localWorkSize[1]); break; case SpecConstantType::WorkgroupSizeZ: specConstantData.push_back(ndrange.localWorkSize[2]); break; case SpecConstantType::GlobalOffsetX: specConstantData.push_back(ndrange.globalWorkOffset[0]); break; case SpecConstantType::GlobalOffsetY: specConstantData.push_back(ndrange.globalWorkOffset[1]); break; case SpecConstantType::GlobalOffsetZ: specConstantData.push_back(ndrange.globalWorkOffset[2]); break; default: UNIMPLEMENTED(); continue; } mapEntries.push_back(VkSpecializationMapEntry{ .constantID = devProgramData->reflectionData.specConstantIDs[specConstantUsed], .offset = constantDataOffset, .size = sizeof(uint32_t)}); constantDataOffset += sizeof(uint32_t); } // Populate kernel specialization constants (if any) for (const auto &specConstant : mSpecConstants) { specConstantData.push_back(specConstant.data); mapEntries.push_back(VkSpecializationMapEntry{ .constantID = specConstant.ID, .offset = constantDataOffset, .size = sizeof(uint32_t)}); constantDataOffset += sizeof(uint32_t); } VkSpecializationInfo computeSpecializationInfo{ .mapEntryCount = static_cast(mapEntries.size()), .pMapEntries = mapEntries.data(), .dataSize = specConstantData.size() * sizeof(uint32_t), .pData = specConstantData.data(), }; // Now get or create (on compute pipeline cache miss) compute pipeline and return it vk::ComputePipelineOptions options = vk::GetComputePipelineOptions( vk::PipelineRobustness::NonRobust, vk::PipelineProtectedAccess::Unprotected); return mShaderProgramHelper.getOrCreateComputePipeline( mContext, &mComputePipelineCache, pipelineCache, getPipelineLayout(), options, PipelineSource::Draw, pipelineOut, mName.c_str(), &computeSpecializationInfo); } bool CLKernelVk::usesPrintf() const { return mProgram->getDeviceProgramData(mName.c_str())->getKernelFlags(mName) & NonSemanticClspvReflectionMayUsePrintf; } angle::Result CLKernelVk::initializeDescriptorPools() { for (DescriptorSetIndex index : angle::AllEnums()) { if (!mDescriptorSetLayoutDescs[index].empty()) { ANGLE_TRY(mContext->getMetaDescriptorPool().bindCachedDescriptorPool( mContext, mDescriptorSetLayoutDescs[index], 1, mContext->getDescriptorSetLayoutCache(), &mDynamicDescriptorPools[index])); } } return angle::Result::Continue; } angle::Result CLKernelVk::allocateDescriptorSet( DescriptorSetIndex index, angle::EnumIterator layoutIndex, vk::OutsideRenderPassCommandBufferHelper *computePassCommands) { if (mDescriptorSets[index] && mDescriptorSets[index]->valid()) { if (mDescriptorSets[index]->usedByCommandBuffer(computePassCommands->getQueueSerial())) { mDescriptorSets[index].reset(); } else { return angle::Result::Continue; } } if (mDynamicDescriptorPools[index]->valid()) { ANGLE_TRY(mDynamicDescriptorPools[index]->allocateDescriptorSet( mContext, *mDescriptorSetLayouts[*layoutIndex], &mDescriptorSets[index])); computePassCommands->retainResource(mDescriptorSets[index].get()); } return angle::Result::Continue; } } // namespace rx