/*------------------------------------------------------------------------
 * Vulkan Conformance Tests
 * ------------------------
 *
 * Copyright (c) 2016 The Khronos Group Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *//*!
 * \file  vktSparseResourcesBufferMemoryAliasing.cpp
 * \brief Sparse buffer memory aliasing tests
 *//*--------------------------------------------------------------------*/

#include "vktSparseResourcesBufferMemoryAliasing.hpp"
#include "vktSparseResourcesTestsUtil.hpp"
#include "vktSparseResourcesBase.hpp"
#include "vktTestCaseUtil.hpp"

#include "vkDefs.hpp"
#include "vkRef.hpp"
#include "vkRefUtil.hpp"
#include "vkPlatform.hpp"
#include "vkPrograms.hpp"
#include "vkRefUtil.hpp"
#include "vkMemUtil.hpp"
#include "vkBarrierUtil.hpp"
#include "vkQueryUtil.hpp"
#include "vkBuilderUtil.hpp"
#include "vkTypeUtil.hpp"
#include "vkCmdUtil.hpp"
#include "vkObjUtil.hpp"

#include "deStringUtil.hpp"
#include "deUniquePtr.hpp"

#include <algorithm>
#include <sstream>
#include <string>
#include <vector>

using namespace vk;

namespace vkt
{
namespace sparse
{
namespace
{

// Constants shared between the generated compute shader and the host-side reference data.
enum ShaderParameters
{
	SIZE_OF_UINT_IN_SHADER	= 4u,		// Bytes per element of the shader's uint result[] array
	MODULO_DIVISOR			= 1024u		// Element i is written as (i % MODULO_DIVISOR)
};

/*--------------------------------------------------------------------*//*!
 * \brief Choose a compute work group size for the given invocation count.
 *
 * Each dimension is clamped against a hard-coded per-dimension limit
 * (128, 128, 64) and against what is left of a hard-coded total of
 * 128 invocations per work group once the previous dimensions are chosen.
 *
 * \param numInvocations	Number of shader invocations required; must be non-zero.
 * \return Work group size as (x, y, z).
 *//*--------------------------------------------------------------------*/
tcu::UVec3 computeWorkGroupSize (const deUint32 numInvocations)
{
	const deUint32		maxComputeWorkGroupInvocations	= 128u;
	const tcu::UVec3	maxComputeWorkGroupSize			= tcu::UVec3(128u, 128u, 64u);
	deUint32			numInvocationsLeft				= numInvocations;

	// A zero count would give xWorkGroupSize == 0 and a division by zero on the next line.
	DE_ASSERT(numInvocations > 0u);

	const deUint32 xWorkGroupSize = std::min(std::min(numInvocationsLeft, maxComputeWorkGroupSize.x()), maxComputeWorkGroupInvocations);
	numInvocationsLeft = numInvocationsLeft / xWorkGroupSize + ((numInvocationsLeft % xWorkGroupSize) ? 1u : 0u);

	const deUint32 yWorkGroupSize = std::min(std::min(numInvocationsLeft, maxComputeWorkGroupSize.y()), maxComputeWorkGroupInvocations / xWorkGroupSize);
	numInvocationsLeft = numInvocationsLeft / yWorkGroupSize + ((numInvocationsLeft % yWorkGroupSize) ? 1u : 0u);

	const deUint32 zWorkGroupSize = std::min(std::min(numInvocationsLeft, maxComputeWorkGroupSize.z()), maxComputeWorkGroupInvocations / (xWorkGroupSize*yWorkGroupSize));

	return tcu::UVec3(xWorkGroupSize, yWorkGroupSize, zWorkGroupSize);
}

// Test case: two sparse buffers are bound to the same memory; a compute shader writes
// through one of them and the data is read back through the other.
class BufferSparseMemoryAliasingCase : public TestCase
{
public:
					BufferSparseMemoryAliasingCase	(tcu::TestContext&		testCtx,
													 const std::string&		name,
													 const deUint32			bufferSize,
													 const glu::GLSLVersion	glslVersion,
													 const bool				useDeviceGroups);

	void			initPrograms					(SourceCollections&		sourceCollections) const;
	TestInstance*	createInstance					(Context&				context) const;
	virtual void	checkSupport					(Context&				context) const;

private:
	const deUint32			m_bufferSizeInBytes;
	const glu::GLSLVersion	m_glslVersion;
	const bool				m_useDeviceGroups;
};

BufferSparseMemoryAliasingCase::BufferSparseMemoryAliasingCase (tcu::TestContext&		testCtx,
																const std::string&		name,
																const deUint32			bufferSize,
																const glu::GLSLVersion	glslVersion,
																const bool				useDeviceGroups)
	: TestCase				(testCtx, name)
	, m_bufferSizeInBytes	(bufferSize)
	, m_glslVersion			(glslVersion)
	, m_useDeviceGroups		(useDeviceGroups)
{
}

// Requires the sparseBinding and sparseResidencyAliased device features.
void BufferSparseMemoryAliasingCase::checkSupport (Context& context) const
{
	context.requireDeviceCoreFeature(DEVICE_CORE_FEATURE_SPARSE_BINDING);
	context.requireDeviceCoreFeature(DEVICE_CORE_FEATURE_SPARSE_RESIDENCY_ALIASED);
}

// Generates the "comp" compute shader, which stores (index % MODULO_DIVISOR) into every
// uint element of the output storage buffer.
void BufferSparseMemoryAliasingCase::initPrograms (SourceCollections& sourceCollections) const
{
	// Create compute program
	const char* const	versionDecl		= glu::getGLSLVersionDeclaration(m_glslVersion);
	const deUint32		numInvocations	= m_bufferSizeInBytes / SIZE_OF_UINT_IN_SHADER;
	const tcu::UVec3	workGroupSize	= computeWorkGroupSize(numInvocations);

	std::ostringstream src;
	src << versionDecl << "\n"
		<< "layout (local_size_x = " << workGroupSize.x() << ", local_size_y = " << workGroupSize.y() << ", local_size_z = " << workGroupSize.z() << ") in;\n"
		<< "layout(set = 0, binding = 0, std430) writeonly buffer Output\n"
		<< "{\n"
		<< " uint result[];\n"
		<< "} sb_out;\n"
		<< "\n"
		<< "void main (void)\n"
		<< "{\n"
		<< " uint index = gl_GlobalInvocationID.x + (gl_GlobalInvocationID.y + gl_GlobalInvocationID.z*gl_NumWorkGroups.y*gl_WorkGroupSize.y)*gl_NumWorkGroups.x*gl_WorkGroupSize.x;\n"
		// The dispatch may be rounded up, so invocations past the end of the buffer must not write.
		<< " if ( index < " << m_bufferSizeInBytes / SIZE_OF_UINT_IN_SHADER << "u )\n"
		<< " {\n"
		<< " sb_out.result[index] = index % " << MODULO_DIVISOR << "u;\n"
		<< " }\n"
		<< "}\n";

	sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
}

// Test instance; all of the work is done in iterate().
class BufferSparseMemoryAliasingInstance : public SparseResourcesBaseInstance
{
public:
					BufferSparseMemoryAliasingInstance	(Context&			context,
														 const deUint32		bufferSize,
														 const bool			useDeviceGroups);

	tcu::TestStatus	iterate								(void);

private:
	const deUint32	m_bufferSizeInBytes;
	const bool		m_useDeviceGroups;	// Initialised from a bool and only ever used as a boolean
};

BufferSparseMemoryAliasingInstance::BufferSparseMemoryAliasingInstance (Context&		context,
																		const deUint32	bufferSize,
																		const bool		useDeviceGroups)
	: SparseResourcesBaseInstance	(context, useDeviceGroups)
	, m_bufferSizeInBytes			(bufferSize)
	, m_useDeviceGroups				(useDeviceGroups)
{
}

/*--------------------------------------------------------------------*//*!
 * \brief Run the aliasing test once per physical device.
 *
 * For each physical device: create two sparse aliased buffers, bind both
 * to the same device memory, fill the memory through the "write" buffer
 * with a compute shader, copy it out through the "read" buffer into a
 * host-visible buffer and compare against host-generated reference data.
 *
 * \return pass if every iteration matched the reference data, fail on a
 *         mismatch or when no matching memory type is found. Throws
 *         NotSupportedError when the device limits or peer memory features
 *         are insufficient.
 *//*--------------------------------------------------------------------*/
tcu::TestStatus BufferSparseMemoryAliasingInstance::iterate (void)
{
	const InstanceInterface& instance = m_context.getInstanceInterface();

	{
		// Create logical device supporting both sparse and compute operations
		QueueRequirementsVec queueRequirements;
		queueRequirements.push_back(QueueRequirements(VK_QUEUE_SPARSE_BINDING_BIT, 1u));
		queueRequirements.push_back(QueueRequirements(VK_QUEUE_COMPUTE_BIT, 1u));

		createDeviceSupportingQueues(queueRequirements);
	}

	const vk::VkPhysicalDevice&	physicalDevice	= getPhysicalDevice();
	const DeviceInterface&		deviceInterface	= getDeviceInterface();
	const Queue&				sparseQueue		= getQueue(VK_QUEUE_SPARSE_BINDING_BIT, 0);
	const Queue&				computeQueue	= getQueue(VK_QUEUE_COMPUTE_BIT, 0);

	// Go through all physical devices
	for (deUint32 physDevID = 0; physDevID < m_numPhysicalDevices; physDevID++)
	{
		// The resource lives on firstDeviceID, its backing memory on secondDeviceID.
		// With a single physical device both indices are the same.
		const deUint32 firstDeviceID	= physDevID;
		const deUint32 secondDeviceID	= (firstDeviceID + 1) % m_numPhysicalDevices;

		VkBufferCreateInfo bufferCreateInfo =
		{
			VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,	// VkStructureType		sType;
			DE_NULL,								// const void*			pNext;
			VK_BUFFER_CREATE_SPARSE_BINDING_BIT |
			VK_BUFFER_CREATE_SPARSE_ALIASED_BIT,	// VkBufferCreateFlags	flags;
			m_bufferSizeInBytes,					// VkDeviceSize			size;
			VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
			VK_BUFFER_USAGE_TRANSFER_SRC_BIT,		// VkBufferUsageFlags	usage;
			VK_SHARING_MODE_EXCLUSIVE,				// VkSharingMode		sharingMode;
			0u,										// deUint32				queueFamilyIndexCount;
			DE_NULL									// const deUint32*		pQueueFamilyIndices;
		};

		const deUint32 queueFamilyIndices[] = { sparseQueue.queueFamilyIndex, computeQueue.queueFamilyIndex };

		// The buffers are used on both queues; share them when the queues come from different families
		if (sparseQueue.queueFamilyIndex != computeQueue.queueFamilyIndex)
		{
			bufferCreateInfo.sharingMode			= VK_SHARING_MODE_CONCURRENT;
			bufferCreateInfo.queueFamilyIndexCount	= 2u;
			bufferCreateInfo.pQueueFamilyIndices	= queueFamilyIndices;
		}

		// Create sparse buffers
		const Unique<VkBuffer> sparseBufferWrite	(createBuffer(deviceInterface, getDevice(), &bufferCreateInfo));
		const Unique<VkBuffer> sparseBufferRead		(createBuffer(deviceInterface, getDevice(), &bufferCreateInfo));

		// Create sparse buffers memory bind semaphore
		const Unique<VkSemaphore> bufferMemoryBindSemaphore(createSemaphore(deviceInterface, getDevice()));

		// Both buffers share one create info, so the requirements are queried from the write buffer only
		const VkMemoryRequirements bufferMemRequirements = getBufferMemoryRequirements(deviceInterface, getDevice(), *sparseBufferWrite);

		if (bufferMemRequirements.size > getPhysicalDeviceProperties(instance, physicalDevice).limits.sparseAddressSpaceSize)
			TCU_THROW(NotSupportedError, "Required memory size for sparse resources exceeds device limits");

		DE_ASSERT((bufferMemRequirements.size % bufferMemRequirements.alignment) == 0);

		const deUint32 memoryType = findMatchingMemoryType(instance, getPhysicalDevice(secondDeviceID), bufferMemRequirements, MemoryRequirement::Any);

		if (memoryType == NO_MATCH_FOUND)
			return tcu::TestStatus::fail("No matching memory type found");

		if (firstDeviceID != secondDeviceID)
		{
			VkPeerMemoryFeatureFlags	peerMemoryFeatureFlags	= (VkPeerMemoryFeatureFlags)0;
			const deUint32				heapIndex				= getHeapIndexForMemoryType(instance, getPhysicalDevice(secondDeviceID), memoryType);
			deviceInterface.getDeviceGroupPeerMemoryFeatures(getDevice(), heapIndex, firstDeviceID, secondDeviceID, &peerMemoryFeatureFlags);

			if (((peerMemoryFeatureFlags & VK_PEER_MEMORY_FEATURE_COPY_SRC_BIT)		== 0) ||
				((peerMemoryFeatureFlags & VK_PEER_MEMORY_FEATURE_GENERIC_DST_BIT)	== 0))
			{
				TCU_THROW(NotSupportedError, "Peer memory does not support COPY_SRC and GENERIC_DST");
			}
		}

		const VkSparseMemoryBind sparseMemoryBind = makeSparseMemoryBind(deviceInterface, getDevice(), bufferMemRequirements.size, memoryType, 0u);

		// Take ownership of the memory referenced by the bind so it is released when this iteration ends
		Move<VkDeviceMemory> deviceMemoryPtr(check<VkDeviceMemory>(sparseMemoryBind.memory), Deleter<VkDeviceMemory>(deviceInterface, getDevice(), DE_NULL));

		{
			// Both buffers use the very same bind, i.e. they alias the same memory range
			const VkSparseBufferMemoryBindInfo sparseBufferMemoryBindInfo[2] =
			{
				makeSparseBufferMemoryBindInfo
				(*sparseBufferWrite,	//VkBuffer						buffer;
				1u,						//deUint32						bindCount;
				&sparseMemoryBind		//const VkSparseMemoryBind*		Binds;
				),

				makeSparseBufferMemoryBindInfo
				(*sparseBufferRead,		//VkBuffer						buffer;
				1u,						//deUint32						bindCount;
				&sparseMemoryBind		//const VkSparseMemoryBind*		Binds;
				)
			};

			const VkDeviceGroupBindSparseInfo devGroupBindSparseInfo =
			{
				VK_STRUCTURE_TYPE_DEVICE_GROUP_BIND_SPARSE_INFO,	//VkStructureType	sType;
				DE_NULL,											//const void*		pNext;
				firstDeviceID,										//deUint32			resourceDeviceIndex;
				secondDeviceID,										//deUint32			memoryDeviceIndex;
			};

			const VkBindSparseInfo bindSparseInfo =
			{
				VK_STRUCTURE_TYPE_BIND_SPARSE_INFO,						//VkStructureType							sType;
				m_useDeviceGroups ? &devGroupBindSparseInfo : DE_NULL,	//const void*								pNext;
				0u,														//deUint32									waitSemaphoreCount;
				DE_NULL,												//const VkSemaphore*						pWaitSemaphores;
				2u,														//deUint32									bufferBindCount;
				sparseBufferMemoryBindInfo,								//const VkSparseBufferMemoryBindInfo*		pBufferBinds;
				0u,														//deUint32									imageOpaqueBindCount;
				DE_NULL,												//const VkSparseImageOpaqueMemoryBindInfo*	pImageOpaqueBinds;
				0u,														//deUint32									imageBindCount;
				DE_NULL,												//const VkSparseImageMemoryBindInfo*		pImageBinds;
				1u,														//deUint32									signalSemaphoreCount;
				&bufferMemoryBindSemaphore.get()						//const VkSemaphore*						pSignalSemaphores;
			};

			// Submit sparse bind commands for execution
			VK_CHECK(deviceInterface.queueBindSparse(sparseQueue.queueHandle, 1u, &bindSparseInfo, DE_NULL));
		}

		// Create output buffer
		const VkBufferCreateInfo			outputBufferCreateInfo	= makeBufferCreateInfo(m_bufferSizeInBytes, VK_BUFFER_USAGE_TRANSFER_DST_BIT);
		const Unique<VkBuffer>				outputBuffer			(createBuffer(deviceInterface, getDevice(), &outputBufferCreateInfo));
		const de::UniquePtr<Allocation>		outputBufferAlloc		(bindBuffer(deviceInterface, getDevice(), getAllocator(), *outputBuffer, MemoryRequirement::HostVisible));

		// Create command buffer for compute and data transfer operations
		const Unique<VkCommandPool>		commandPool		(makeCommandPool(deviceInterface, getDevice(), computeQueue.queueFamilyIndex));
		const Unique<VkCommandBuffer>	commandBuffer	(allocateCommandBuffer(deviceInterface, getDevice(), *commandPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

		// Start recording commands
		beginCommandBuffer(deviceInterface, *commandBuffer);

		// Create descriptor set layout
		const Unique<VkDescriptorSetLayout> descriptorSetLayout(
			DescriptorSetLayoutBuilder()
			.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
			.build(deviceInterface, getDevice()));

		// Create compute pipeline
		const Unique<VkShaderModule>	shaderModule	(createShaderModule(deviceInterface, getDevice(), m_context.getBinaryCollection().get("comp"), DE_NULL));
		const Unique<VkPipelineLayout>	pipelineLayout	(makePipelineLayout(deviceInterface, getDevice(), *descriptorSetLayout));
		const Unique<VkPipeline>		computePipeline	(makeComputePipeline(deviceInterface, getDevice(), *pipelineLayout, *shaderModule));

		deviceInterface.cmdBindPipeline(*commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *computePipeline);

		// Create descriptor set
		const Unique<VkDescriptorPool> descriptorPool(
			DescriptorPoolBuilder()
			.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1u)
			.build(deviceInterface, getDevice(), VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

		const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(deviceInterface, getDevice(), *descriptorPool, *descriptorSetLayout));

		{
			// The shader writes through the "write" alias
			const VkDescriptorBufferInfo sparseBufferInfo = makeDescriptorBufferInfo(*sparseBufferWrite, 0u, m_bufferSizeInBytes);

			DescriptorSetUpdateBuilder()
				.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &sparseBufferInfo)
				.update(deviceInterface, getDevice());
		}

		deviceInterface.cmdBindDescriptorSets(*commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);

		{
			const deUint32		numInvocations				= m_bufferSizeInBytes / SIZE_OF_UINT_IN_SHADER;
			const tcu::UVec3	workGroupSize				= computeWorkGroupSize(numInvocations);
			const tcu::UVec3	maxComputeWorkGroupCount	= tcu::UVec3(65535u, 65535u, 65535u);
			const deUint32		invocationsPerWorkGroup		= workGroupSize.x()*workGroupSize.y()*workGroupSize.z();

			// Number of work groups needed to cover every invocation, rounded up. Subtracting one
			// work group's invocations instead of dividing would over-dispatch by roughly the work
			// group size and divide by zero below when the whole buffer fits in one work group.
			deUint32 numWorkGroupsLeft = numInvocations / invocationsPerWorkGroup + ((numInvocations % invocationsPerWorkGroup) ? 1u : 0u);

			const deUint32 xWorkGroupCount = std::min(numWorkGroupsLeft, maxComputeWorkGroupCount.x());
			numWorkGroupsLeft = numWorkGroupsLeft / xWorkGroupCount + ((numWorkGroupsLeft % xWorkGroupCount) ? 1u : 0u);

			const deUint32 yWorkGroupCount = std::min(numWorkGroupsLeft, maxComputeWorkGroupCount.y());
			numWorkGroupsLeft = numWorkGroupsLeft / yWorkGroupCount + ((numWorkGroupsLeft % yWorkGroupCount) ? 1u : 0u);

			const deUint32 zWorkGroupCount = std::min(numWorkGroupsLeft, maxComputeWorkGroupCount.z());
			numWorkGroupsLeft = numWorkGroupsLeft / zWorkGroupCount + ((numWorkGroupsLeft % zWorkGroupCount) ? 1u : 0u);

			// Anything other than 1 means the three dimensions together cannot cover the buffer
			if (numWorkGroupsLeft != 1u)
				TCU_THROW(NotSupportedError, "Buffer size is not supported");

			deviceInterface.cmdDispatch(*commandBuffer, xWorkGroupCount, yWorkGroupCount, zWorkGroupCount);
		}

		{
			// Make the shader writes available to the transfer read that follows
			const VkBufferMemoryBarrier sparseBufferWriteBarrier
				= makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT,
										  VK_ACCESS_TRANSFER_READ_BIT,
										  *sparseBufferWrite,
										  0ull,
										  m_bufferSizeInBytes);

			deviceInterface.cmdPipelineBarrier(*commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 0u, 0u, DE_NULL, 1u, &sparseBufferWriteBarrier, 0u, DE_NULL);
		}

		{
			// Read back through the "read" alias, which is bound to the same memory as the "write" alias
			const VkBufferCopy bufferCopy = makeBufferCopy(0u, 0u, m_bufferSizeInBytes);

			deviceInterface.cmdCopyBuffer(*commandBuffer, *sparseBufferRead, *outputBuffer, 1u, &bufferCopy);
		}

		{
			const VkBufferMemoryBarrier outputBufferHostBarrier
				= makeBufferMemoryBarrier(VK_ACCESS_TRANSFER_WRITE_BIT,
										  VK_ACCESS_HOST_READ_BIT,
										  *outputBuffer,
										  0ull,
										  m_bufferSizeInBytes);

			deviceInterface.cmdPipelineBarrier(*commandBuffer, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_HOST_BIT, 0u, 0u, DE_NULL, 1u, &outputBufferHostBarrier, 0u, DE_NULL);
		}

		// End recording commands
		endCommandBuffer(deviceInterface, *commandBuffer);

		// The stage at which execution is going to wait for finish of sparse binding operations
		const VkPipelineStageFlags waitStageBits[] = { VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT };

		// Submit commands for execution and wait for completion
		// In case of device groups, submit on the physical device with the resource
		submitCommandsAndWait(deviceInterface, getDevice(), computeQueue.queueHandle, *commandBuffer, 1u, &bufferMemoryBindSemaphore.get(),
			waitStageBits, 0, DE_NULL, m_useDeviceGroups, firstDeviceID);

		// Retrieve data from output buffer to host memory
		invalidateAlloc(deviceInterface, getDevice(), *outputBufferAlloc);

		const deUint8* outputData = static_cast<const deUint8*>(outputBufferAlloc->getHostPtr());

		// Wait for sparse queue to become idle; do not silently ignore a failure here
		VK_CHECK(deviceInterface.queueWaitIdle(sparseQueue.queueHandle));

		// Prepare reference data
		std::vector<deUint8> referenceData;
		referenceData.resize(m_bufferSizeInBytes);

		// One period of the pattern written by the shader: 0, 1, ..., MODULO_DIVISOR - 1
		std::vector<deUint32> referenceDataBlock;
		referenceDataBlock.resize(MODULO_DIVISOR);

		for (deUint32 valueNdx = 0; valueNdx < MODULO_DIVISOR; ++valueNdx)
		{
			referenceDataBlock[valueNdx] = valueNdx % MODULO_DIVISOR;
		}

		const deUint32 fullBlockSizeInBytes = MODULO_DIVISOR * SIZE_OF_UINT_IN_SHADER;
		const deUint32 lastBlockSizeInBytes = m_bufferSizeInBytes % fullBlockSizeInBytes;
		const deUint32 numberOfBlocks		= m_bufferSizeInBytes / fullBlockSizeInBytes + (lastBlockSizeInBytes ? 1u : 0u);

		// Tile the period over the whole buffer; only the last block may be partial
		for (deUint32 blockNdx = 0; blockNdx < numberOfBlocks; ++blockNdx)
		{
			const deUint32 offset = blockNdx * fullBlockSizeInBytes;
			deMemcpy(&referenceData[0] + offset, &referenceDataBlock[0], ((offset + fullBlockSizeInBytes) <= m_bufferSizeInBytes) ? fullBlockSizeInBytes : lastBlockSizeInBytes);
		}

		// Compare reference data with output data
		if (deMemCmp(&referenceData[0], outputData, m_bufferSizeInBytes) != 0)
			return tcu::TestStatus::fail("Failed");
	}

	return tcu::TestStatus::pass("Passed");
}

TestInstance* BufferSparseMemoryAliasingCase::createInstance (Context& context) const
{
	return new BufferSparseMemoryAliasingInstance(context, m_bufferSizeInBytes, m_useDeviceGroups);
}

} // anonymous ns

// Adds one aliasing test case per buffer size (2^10 to 2^24 bytes) to the given group.
void addBufferSparseMemoryAliasingTests(tcu::TestCaseGroup* group, const bool useDeviceGroups)
{
	group->addChild(new BufferSparseMemoryAliasingCase(group->getTestContext(), "buffer_size_2_10", 1 << 10, glu::GLSL_VERSION_440, useDeviceGroups));
	group->addChild(new BufferSparseMemoryAliasingCase(group->getTestContext(), "buffer_size_2_12", 1 << 12, glu::GLSL_VERSION_440, useDeviceGroups));
	group->addChild(new BufferSparseMemoryAliasingCase(group->getTestContext(), "buffer_size_2_16", 1 << 16, glu::GLSL_VERSION_440, useDeviceGroups));
	group->addChild(new BufferSparseMemoryAliasingCase(group->getTestContext(), "buffer_size_2_17", 1 << 17, glu::GLSL_VERSION_440, useDeviceGroups));
	group->addChild(new BufferSparseMemoryAliasingCase(group->getTestContext(), "buffer_size_2_20", 1 << 20, glu::GLSL_VERSION_440, useDeviceGroups));
	group->addChild(new BufferSparseMemoryAliasingCase(group->getTestContext(), "buffer_size_2_24", 1 << 24, glu::GLSL_VERSION_440, useDeviceGroups));
}

} // sparse
} // vkt