• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*------------------------------------------------------------------------
2  * Vulkan Conformance Tests
3  * ------------------------
4  *
5  * Copyright (c) 2019 The Khronos Group Inc.
6  * Copyright (c) 2019 The Android Open Source Project
7  * Copyright (c) 2023 LunarG, Inc.
8  * Copyright (c) 2023 Nintendo
9  *
10  * Licensed under the Apache License, Version 2.0 (the "License");
11  * you may not use this file except in compliance with the License.
12  * You may obtain a copy of the License at
13  *
14  *      http://www.apache.org/licenses/LICENSE-2.0
15  *
16  * Unless required by applicable law or agreed to in writing, software
17  * distributed under the License is distributed on an "AS IS" BASIS,
18  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19  * See the License for the specific language governing permissions and
20  * limitations under the License.
21  *
22  *//*!
23  * \file
24  * \brief Compute Shader Tests
25  *//*--------------------------------------------------------------------*/
26 
27 #include "vktComputeBasicComputeShaderTests.hpp"
28 #include "vktTestCase.hpp"
29 #include "vktTestCaseUtil.hpp"
30 #include "vktComputeTestsUtil.hpp"
31 #include "vktCustomInstancesDevices.hpp"
32 #include "vktAmberTestCase.hpp"
33 
34 #include "vkDefs.hpp"
35 #include "vkRef.hpp"
36 #include "vkRefUtil.hpp"
37 #include "vkPlatform.hpp"
38 #include "vkPrograms.hpp"
39 #include "vkRefUtil.hpp"
40 #include "vkMemUtil.hpp"
41 #include "vkBarrierUtil.hpp"
42 #include "vkQueryUtil.hpp"
43 #include "vkBuilderUtil.hpp"
44 #include "vkTypeUtil.hpp"
45 #include "vkDeviceUtil.hpp"
46 #include "vkCmdUtil.hpp"
47 #include "vkObjUtil.hpp"
48 #include "vkBufferWithMemory.hpp"
49 #include "vkSafetyCriticalUtil.hpp"
50 #include "vkImageWithMemory.hpp"
51 
52 #include "tcuCommandLine.hpp"
53 #include "tcuTestLog.hpp"
54 #include "tcuMaybe.hpp"
55 
56 #include "deStringUtil.hpp"
57 #include "deUniquePtr.hpp"
58 #include "deRandom.hpp"
59 
60 #include <vector>
61 #include <memory>
62 
63 using namespace vk;
64 
65 namespace vkt
66 {
67 namespace compute
68 {
69 namespace
70 {
71 
72 template<typename T, int size>
multiplyComponents(const tcu::Vector<T,size> & v)73 T multiplyComponents (const tcu::Vector<T, size>& v)
74 {
75 	T accum = 1;
76 	for (int i = 0; i < size; ++i)
77 		accum *= v[i];
78 	return accum;
79 }
80 
// Returns a*a; used when computing reference values for shader outputs.
template<typename T>
inline T squared (const T& a)
{
	const T result = a * a;
	return result;
}
86 
make2DImageCreateInfo(const tcu::IVec2 & imageSize,const VkImageUsageFlags usage)87 inline VkImageCreateInfo make2DImageCreateInfo (const tcu::IVec2& imageSize, const VkImageUsageFlags usage)
88 {
89 	const VkImageCreateInfo imageParams =
90 	{
91 		VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,				// VkStructureType			sType;
92 		DE_NULL,											// const void*				pNext;
93 		0u,													// VkImageCreateFlags		flags;
94 		VK_IMAGE_TYPE_2D,									// VkImageType				imageType;
95 		VK_FORMAT_R32_UINT,									// VkFormat					format;
96 		vk::makeExtent3D(imageSize.x(), imageSize.y(), 1),	// VkExtent3D				extent;
97 		1u,													// deUint32					mipLevels;
98 		1u,													// deUint32					arrayLayers;
99 		VK_SAMPLE_COUNT_1_BIT,								// VkSampleCountFlagBits	samples;
100 		VK_IMAGE_TILING_OPTIMAL,							// VkImageTiling			tiling;
101 		usage,												// VkImageUsageFlags		usage;
102 		VK_SHARING_MODE_EXCLUSIVE,							// VkSharingMode			sharingMode;
103 		0u,													// deUint32					queueFamilyIndexCount;
104 		DE_NULL,											// const deUint32*			pQueueFamilyIndices;
105 		VK_IMAGE_LAYOUT_UNDEFINED,							// VkImageLayout			initialLayout;
106 	};
107 	return imageParams;
108 }
109 
makeBufferImageCopy(const tcu::IVec2 & imageSize)110 inline VkBufferImageCopy makeBufferImageCopy(const tcu::IVec2& imageSize)
111 {
112 	return compute::makeBufferImageCopy(vk::makeExtent3D(imageSize.x(), imageSize.y(), 1), 1u);
113 }
114 
// Kind of buffer a test variant binds to the shader.
enum BufferType
{
	BUFFER_TYPE_UNIFORM,	// uniform buffer
	BUFFER_TYPE_SSBO,		// shader storage buffer
};
120 
// Test case: compute shader communicates between invocations of a work group
// through a shared-memory array guarded by barrier()/memoryBarrierShared()
// (see initPrograms for the exact shader).
class SharedVarTest : public vkt::TestCase
{
public:
						SharedVarTest	(tcu::TestContext&		testCtx,
										 const std::string&		name,
										 const tcu::IVec3&		localSize,
										 const tcu::IVec3&		workSize,
										 const vk::ComputePipelineConstructionType computePipelineConstructionType);

	virtual void		checkSupport	(Context& context) const;
	void				initPrograms	(SourceCollections&		sourceCollections) const;
	TestInstance*		createInstance	(Context&				context) const;

private:
	const tcu::IVec3					m_localSize;	// work group size (local_size_x/y/z)
	const tcu::IVec3					m_workSize;		// number of work groups dispatched
	vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
139 
// Per-run instance for SharedVarTest; records and submits the dispatch and
// validates the output buffer on the host.
class SharedVarTestInstance : public vkt::TestInstance
{
public:
									SharedVarTestInstance	(Context&			context,
															 const tcu::IVec3&	localSize,
															 const tcu::IVec3&	workSize,
															 const vk::ComputePipelineConstructionType computePipelineConstructionType);

	tcu::TestStatus					iterate					(void);

private:
	const tcu::IVec3					m_localSize;	// work group size (local_size_x/y/z)
	const tcu::IVec3					m_workSize;		// number of work groups dispatched
	vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
155 
// Stores the dispatch configuration only; no Vulkan work happens until
// createInstance()/iterate().
SharedVarTest::SharedVarTest (tcu::TestContext&		testCtx,
							  const std::string&	name,
							  const tcu::IVec3&		localSize,
							  const tcu::IVec3&		workSize,
							  const vk::ComputePipelineConstructionType computePipelineConstructionType)
	: TestCase		(testCtx, name)
	, m_localSize	(localSize)
	, m_workSize	(workSize)
	, m_computePipelineConstructionType(computePipelineConstructionType)
{
}
167 
checkSupport(Context & context) const168 void SharedVarTest::checkSupport (Context& context) const
169 {
170 	checkShaderObjectRequirements(context.getInstanceInterface(), context.getPhysicalDevice(), m_computePipelineConstructionType);
171 }
172 
// Builds the compute shader: each invocation writes (globalOffs + localOffs^2)
// into the "mirrored" shared slot (localSize-localOffs-1), synchronizes with
// memoryBarrierShared()+barrier(), then copies its own shared slot to the SSBO.
// So output[globalOffs+localOffs] == globalOffs + (localSize-localOffs-1)^2,
// which is what SharedVarTestInstance::iterate checks.
void SharedVarTest::initPrograms (SourceCollections& sourceCollections) const
{
	const int workGroupSize = multiplyComponents(m_localSize);
	const int workGroupCount = multiplyComponents(m_workSize);
	const int numValues = workGroupSize * workGroupCount;	// one output element per invocation

	std::ostringstream src;
	src << "#version 310 es\n"
		<< "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
		<< "layout(binding = 0) writeonly buffer Output {\n"
		<< "    uint values[" << numValues << "];\n"
		<< "} sb_out;\n\n"
		<< "shared uint offsets[" << workGroupSize << "];\n\n"
		<< "void main (void) {\n"
		<< "    uint localSize  = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_WorkGroupSize.z;\n"
		<< "    uint globalNdx  = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
		<< "    uint globalOffs = localSize*globalNdx;\n"
		<< "    uint localOffs  = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_LocalInvocationID.z + gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x;\n"
		<< "\n"
		<< "    offsets[localSize-localOffs-1u] = globalOffs + localOffs*localOffs;\n"
		<< "    memoryBarrierShared();\n"
		<< "    barrier();\n"
		<< "    sb_out.values[globalOffs + localOffs] = offsets[localOffs];\n"
		<< "}\n";

	sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
}
200 
// Creates the runtime instance; ownership passes to the framework.
TestInstance* SharedVarTest::createInstance (Context& context) const
{
	return new SharedVarTestInstance(context, m_localSize, m_workSize, m_computePipelineConstructionType);
}
205 
// Captures the dispatch configuration; all Vulkan work happens in iterate().
SharedVarTestInstance::SharedVarTestInstance (Context& context, const tcu::IVec3& localSize, const tcu::IVec3& workSize, const vk::ComputePipelineConstructionType computePipelineConstructionType)
	: TestInstance						(context)
	, m_localSize						(localSize)
	, m_workSize						(workSize)
	, m_computePipelineConstructionType	(computePipelineConstructionType)
{
}
213 
// Runs the shared-variable shader once and validates the result buffer.
// The shader (see SharedVarTest::initPrograms) makes invocation localOffs
// write globalOffs + localOffs^2 into shared slot (localSize-localOffs-1),
// then after a barrier copy shared slot localOffs to the output SSBO.
tcu::TestStatus SharedVarTestInstance::iterate (void)
{
	const DeviceInterface&	vk					= m_context.getDeviceInterface();
	const VkDevice			device				= m_context.getDevice();
	const VkQueue			queue				= m_context.getUniversalQueue();
	const deUint32			queueFamilyIndex	= m_context.getUniversalQueueFamilyIndex();
	Allocator&				allocator			= m_context.getDefaultAllocator();

	const int workGroupSize = multiplyComponents(m_localSize);
	const int workGroupCount = multiplyComponents(m_workSize);

	// Create a buffer and host-visible memory for it (one uint per invocation)

	const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * workGroupSize * workGroupCount;
	const BufferWithMemory buffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);

	// Create descriptor set (single storage buffer at binding 0)

	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
		DescriptorSetLayoutBuilder()
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.build(vk, device));

	const Unique<VkDescriptorPool> descriptorPool(
		DescriptorPoolBuilder()
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

	const VkDescriptorBufferInfo descriptorInfo = makeDescriptorBufferInfo(*buffer, 0ull, bufferSizeBytes);
	DescriptorSetUpdateBuilder()
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &descriptorInfo)
		.update(vk, device);

	// Perform the computation

	ComputePipelineWrapper			pipeline(vk, device, m_computePipelineConstructionType, m_context.getBinaryCollection().get("comp"));
	pipeline.setDescriptorSetLayout(descriptorSetLayout.get());
	pipeline.buildPipeline();

	// Make shader writes visible to host reads after the dispatch
	const VkBufferMemoryBarrier computeFinishBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer, 0ull, bufferSizeBytes);

	const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
	const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

	// Start recording commands

	beginCommandBuffer(vk, *cmdBuffer);

	pipeline.bind(*cmdBuffer);
	vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline.getPipelineLayout(), 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);

	vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());

	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &computeFinishBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

	endCommandBuffer(vk, *cmdBuffer);

	// Wait for completion

	submitCommandsAndWait(vk, device, queue, *cmdBuffer);

	// Validate the results

	const Allocation& bufferAllocation = buffer.getAllocation();
	invalidateAlloc(vk, device, bufferAllocation);	// needed for non-coherent memory

	const deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());

	for (int groupNdx = 0; groupNdx < workGroupCount; ++groupNdx)
	{
		const int globalOffset = groupNdx * workGroupSize;
		for (int localOffset = 0; localOffset < workGroupSize; ++localOffset)
		{
			const deUint32 res = bufferPtr[globalOffset + localOffset];
			// Slot localOffset was filled by the invocation with index
			// (workGroupSize - localOffset - 1), hence the squared term.
			const deUint32 ref = globalOffset + squared(workGroupSize - localOffset - 1);

			if (res != ref)
			{
				std::ostringstream msg;
				msg << "Comparison failed for Output.values[" << (globalOffset + localOffset) << "]";
				return tcu::TestStatus::fail(msg.str());
			}
		}
	}
	return tcu::TestStatus::pass("Compute succeeded");
}
302 
// Test case: work-group invocations increment a shared counter with
// atomicAdd() and use the returned (pre-increment) value as a unique output
// slot (see initPrograms for the exact shader).
class SharedVarAtomicOpTest : public vkt::TestCase
{
public:
						SharedVarAtomicOpTest	(tcu::TestContext&	testCtx,
												 const std::string&	name,
												 const tcu::IVec3&	localSize,
												 const tcu::IVec3&	workSize,
												 const vk::ComputePipelineConstructionType computePipelineConstructionType);

	virtual void		checkSupport			(Context& context) const;
	void				initPrograms			(SourceCollections& sourceCollections) const;
	TestInstance*		createInstance			(Context&			context) const;

private:
	const tcu::IVec3					m_localSize;	// work group size (local_size_x/y/z)
	const tcu::IVec3					m_workSize;		// number of work groups dispatched
	vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
321 
// Per-run instance for SharedVarAtomicOpTest; records and submits the dispatch
// and validates the output buffer on the host.
class SharedVarAtomicOpTestInstance : public vkt::TestInstance
{
public:
									SharedVarAtomicOpTestInstance	(Context&			context,
																	 const tcu::IVec3&	localSize,
																	 const tcu::IVec3&	workSize,
																	 const vk::ComputePipelineConstructionType computePipelineConstructionType);

	tcu::TestStatus					iterate							(void);

private:
	const tcu::IVec3					m_localSize;	// work group size (local_size_x/y/z)
	const tcu::IVec3					m_workSize;		// number of work groups dispatched
	vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
337 
// Stores the dispatch configuration only; no Vulkan work happens until
// createInstance()/iterate().
SharedVarAtomicOpTest::SharedVarAtomicOpTest (tcu::TestContext&		testCtx,
											  const std::string&	name,
											  const tcu::IVec3&		localSize,
											  const tcu::IVec3&		workSize,
											  const vk::ComputePipelineConstructionType computePipelineConstructionType)
	: TestCase							(testCtx, name)
	, m_localSize						(localSize)
	, m_workSize						(workSize)
	, m_computePipelineConstructionType	(computePipelineConstructionType)
{
}
349 
checkSupport(Context & context) const350 void SharedVarAtomicOpTest::checkSupport (Context& context) const
351 {
352 	checkShaderObjectRequirements(context.getInstanceInterface(), context.getPhysicalDevice(), m_computePipelineConstructionType);
353 }
354 
// Builds the compute shader: the work group zero-initializes a shared counter,
// synchronizes, then each invocation atomically grabs a unique slot index
// (oldVal) and writes oldVal+1 there. Slot i of each group therefore ends up
// holding i+1 regardless of execution order, which is what
// SharedVarAtomicOpTestInstance::iterate checks.
void SharedVarAtomicOpTest::initPrograms (SourceCollections& sourceCollections) const
{
	const int workGroupSize = multiplyComponents(m_localSize);
	const int workGroupCount = multiplyComponents(m_workSize);
	const int numValues = workGroupSize * workGroupCount;	// one output element per invocation

	std::ostringstream src;
	src << "#version 310 es\n"
		<< "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
		<< "layout(binding = 0) writeonly buffer Output {\n"
		<< "    uint values[" << numValues << "];\n"
		<< "} sb_out;\n\n"
		<< "shared uint count;\n\n"
		<< "void main (void) {\n"
		<< "    uint localSize  = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_WorkGroupSize.z;\n"
		<< "    uint globalNdx  = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
		<< "    uint globalOffs = localSize*globalNdx;\n"
		<< "\n"
		<< "    count = 0u;\n"
		<< "    memoryBarrierShared();\n"
		<< "    barrier();\n"
		<< "    uint oldVal = atomicAdd(count, 1u);\n"
		<< "    sb_out.values[globalOffs+oldVal] = oldVal+1u;\n"
		<< "}\n";

	sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
}
382 
// Creates the runtime instance; ownership passes to the framework.
TestInstance* SharedVarAtomicOpTest::createInstance (Context& context) const
{
	return new SharedVarAtomicOpTestInstance(context, m_localSize, m_workSize, m_computePipelineConstructionType);
}
387 
// Captures the dispatch configuration; all Vulkan work happens in iterate().
SharedVarAtomicOpTestInstance::SharedVarAtomicOpTestInstance (Context& context, const tcu::IVec3& localSize, const tcu::IVec3& workSize, const vk::ComputePipelineConstructionType computePipelineConstructionType)
	: TestInstance						(context)
	, m_localSize						(localSize)
	, m_workSize						(workSize)
	, m_computePipelineConstructionType	(computePipelineConstructionType)
{
}
395 
// Runs the shared-counter atomicAdd shader once and validates the result
// buffer. The shader (see SharedVarAtomicOpTest::initPrograms) makes each
// invocation write oldVal+1 at slot oldVal, where oldVal is the value
// atomicAdd(count, 1u) returned, so slot i of each group must hold i+1.
tcu::TestStatus SharedVarAtomicOpTestInstance::iterate (void)
{
	const DeviceInterface&	vk					= m_context.getDeviceInterface();
	const VkDevice			device				= m_context.getDevice();
	const VkQueue			queue				= m_context.getUniversalQueue();
	const deUint32			queueFamilyIndex	= m_context.getUniversalQueueFamilyIndex();
	Allocator&				allocator			= m_context.getDefaultAllocator();

	const int workGroupSize = multiplyComponents(m_localSize);
	const int workGroupCount = multiplyComponents(m_workSize);

	// Create a buffer and host-visible memory for it (one uint per invocation)

	const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * workGroupSize * workGroupCount;
	const BufferWithMemory buffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);

	// Create descriptor set (single storage buffer at binding 0)

	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
		DescriptorSetLayoutBuilder()
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.build(vk, device));

	const Unique<VkDescriptorPool> descriptorPool(
		DescriptorPoolBuilder()
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

	const VkDescriptorBufferInfo descriptorInfo = makeDescriptorBufferInfo(*buffer, 0ull, bufferSizeBytes);
	DescriptorSetUpdateBuilder()
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &descriptorInfo)
		.update(vk, device);

	// Perform the computation

	ComputePipelineWrapper			pipeline(vk, device, m_computePipelineConstructionType, m_context.getBinaryCollection().get("comp"));
	pipeline.setDescriptorSetLayout(descriptorSetLayout.get());
	pipeline.buildPipeline();

	// Make shader writes visible to host reads after the dispatch
	const VkBufferMemoryBarrier computeFinishBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer, 0ull, bufferSizeBytes);

	const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
	const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

	// Start recording commands

	beginCommandBuffer(vk, *cmdBuffer);

	pipeline.bind(*cmdBuffer);
	vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline.getPipelineLayout(), 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);

	vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());

	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1u, &computeFinishBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

	endCommandBuffer(vk, *cmdBuffer);

	// Wait for completion

	submitCommandsAndWait(vk, device, queue, *cmdBuffer);

	// Validate the results

	const Allocation& bufferAllocation = buffer.getAllocation();
	invalidateAlloc(vk, device, bufferAllocation);	// needed for non-coherent memory

	const deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());

	for (int groupNdx = 0; groupNdx < workGroupCount; ++groupNdx)
	{
		const int globalOffset = groupNdx * workGroupSize;
		for (int localOffset = 0; localOffset < workGroupSize; ++localOffset)
		{
			const deUint32 res = bufferPtr[globalOffset + localOffset];
			const deUint32 ref = localOffset + 1;	// slot i holds i+1 (see shader)

			if (res != ref)
			{
				std::ostringstream msg;
				msg << "Comparison failed for Output.values[" << (globalOffset + localOffset) << "]";
				return tcu::TestStatus::fail(msg.str());
			}
		}
	}
	return tcu::TestStatus::pass("Compute succeeded");
}
484 
// Test case: work-group invocations communicate through a coherent SSBO,
// using memoryBarrierBuffer()+barrier() between read-modify-write rounds
// (see initPrograms for the exact shader).
class SSBOLocalBarrierTest : public vkt::TestCase
{
public:
						SSBOLocalBarrierTest	(tcu::TestContext&	testCtx,
												 const std::string& name,
												 const tcu::IVec3&	localSize,
												 const tcu::IVec3&	workSize,
												 const vk::ComputePipelineConstructionType computePipelineConstructionType
						);

	virtual void		checkSupport			(Context& context) const;
	void				initPrograms			(SourceCollections& sourceCollections) const;
	TestInstance*		createInstance			(Context&			context) const;

private:
	const tcu::IVec3	m_localSize;	// work group size (local_size_x/y/z)
	const tcu::IVec3	m_workSize;		// number of work groups dispatched
	vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
504 
// Per-run instance for SSBOLocalBarrierTest; records and submits the dispatch
// and validates the output buffer on the host.
class SSBOLocalBarrierTestInstance : public vkt::TestInstance
{
public:
									SSBOLocalBarrierTestInstance	(Context&			context,
																	 const tcu::IVec3&	localSize,
																	 const tcu::IVec3&	workSize,
																	 const vk::ComputePipelineConstructionType computePipelineConstructionType);

	tcu::TestStatus					iterate							(void);

private:
	const tcu::IVec3					m_localSize;	// work group size (local_size_x/y/z)
	const tcu::IVec3					m_workSize;		// number of work groups dispatched
	vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
520 
// Stores the dispatch configuration only; no Vulkan work happens until
// createInstance()/iterate().
SSBOLocalBarrierTest::SSBOLocalBarrierTest (tcu::TestContext&	testCtx,
											const std::string&	name,
											const tcu::IVec3&	localSize,
											const tcu::IVec3&	workSize,
											const vk::ComputePipelineConstructionType computePipelineConstructionType)
	: TestCase		(testCtx, name)
	, m_localSize	(localSize)
	, m_workSize	(workSize)
	, m_computePipelineConstructionType(computePipelineConstructionType)
{
}
532 
checkSupport(Context & context) const533 void SSBOLocalBarrierTest::checkSupport (Context& context) const
534 {
535 	checkShaderObjectRequirements(context.getInstanceInterface(), context.getPhysicalDevice(), m_computePipelineConstructionType);
536 }
537 
// Builds the compute shader: each invocation seeds its own SSBO slot with
// globalOffs, then in two barrier-separated rounds adds its localOffs into the
// slots of the next and next-next invocations (mod localSize). The buffer is
// declared coherent so cross-invocation reads/writes are well-defined after
// memoryBarrierBuffer()+barrier(). Final slot i holds
// globalOffs + ((i-1) mod localSize) + ((i-2) mod localSize), which is what
// SSBOLocalBarrierTestInstance::iterate checks.
void SSBOLocalBarrierTest::initPrograms (SourceCollections& sourceCollections) const
{
	const int workGroupSize = multiplyComponents(m_localSize);
	const int workGroupCount = multiplyComponents(m_workSize);
	const int numValues = workGroupSize * workGroupCount;	// one output element per invocation

	std::ostringstream src;
	src << "#version 310 es\n"
		<< "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
		<< "layout(binding = 0) coherent buffer Output {\n"
		<< "    uint values[" << numValues << "];\n"
		<< "} sb_out;\n\n"
		<< "void main (void) {\n"
		<< "    uint localSize  = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_WorkGroupSize.z;\n"
		<< "    uint globalNdx  = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
		<< "    uint globalOffs = localSize*globalNdx;\n"
		<< "    uint localOffs  = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_LocalInvocationID.z + gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x;\n"
		<< "\n"
		<< "    sb_out.values[globalOffs + localOffs] = globalOffs;\n"
		<< "    memoryBarrierBuffer();\n"
		<< "    barrier();\n"
		<< "    sb_out.values[globalOffs + ((localOffs+1u)%localSize)] += localOffs;\n"		// += so we read and write
		<< "    memoryBarrierBuffer();\n"
		<< "    barrier();\n"
		<< "    sb_out.values[globalOffs + ((localOffs+2u)%localSize)] += localOffs;\n"
		<< "}\n";

	sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
}
567 
// Creates the runtime instance; ownership passes to the framework.
TestInstance* SSBOLocalBarrierTest::createInstance (Context& context) const
{
	return new SSBOLocalBarrierTestInstance(context, m_localSize, m_workSize, m_computePipelineConstructionType);
}
572 
// Captures the dispatch configuration; all Vulkan work happens in iterate().
SSBOLocalBarrierTestInstance::SSBOLocalBarrierTestInstance (Context& context, const tcu::IVec3& localSize, const tcu::IVec3& workSize, const vk::ComputePipelineConstructionType computePipelineConstructionType)
	: TestInstance	(context)
	, m_localSize	(localSize)
	, m_workSize	(workSize)
	, m_computePipelineConstructionType(computePipelineConstructionType)
{
}
580 
// Runs the coherent-SSBO barrier shader once and validates the result buffer.
// The shader (see SSBOLocalBarrierTest::initPrograms) seeds slot i with
// globalOffs and then adds the localOffs of the (i-1) and (i-2) invocations
// (mod localSize) across two barrier-separated rounds.
tcu::TestStatus SSBOLocalBarrierTestInstance::iterate (void)
{
	const DeviceInterface&	vk					= m_context.getDeviceInterface();
	const VkDevice			device				= m_context.getDevice();
	const VkQueue			queue				= m_context.getUniversalQueue();
	const deUint32			queueFamilyIndex	= m_context.getUniversalQueueFamilyIndex();
	Allocator&				allocator			= m_context.getDefaultAllocator();

	const int workGroupSize = multiplyComponents(m_localSize);
	const int workGroupCount = multiplyComponents(m_workSize);

	// Create a buffer and host-visible memory for it (one uint per invocation)

	const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * workGroupSize * workGroupCount;
	const BufferWithMemory buffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);

	// Create descriptor set (single storage buffer at binding 0)

	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
		DescriptorSetLayoutBuilder()
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.build(vk, device));

	const Unique<VkDescriptorPool> descriptorPool(
		DescriptorPoolBuilder()
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

	const VkDescriptorBufferInfo descriptorInfo = makeDescriptorBufferInfo(*buffer, 0ull, bufferSizeBytes);
	DescriptorSetUpdateBuilder()
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &descriptorInfo)
		.update(vk, device);

	// Perform the computation

	ComputePipelineWrapper			pipeline(vk, device, m_computePipelineConstructionType, m_context.getBinaryCollection().get("comp"));
	pipeline.setDescriptorSetLayout(descriptorSetLayout.get());
	pipeline.buildPipeline();

	// Make shader writes visible to host reads after the dispatch
	const VkBufferMemoryBarrier computeFinishBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer, 0ull, bufferSizeBytes);

	const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
	const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

	// Start recording commands

	beginCommandBuffer(vk, *cmdBuffer);

	pipeline.bind(*cmdBuffer);
	vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline.getPipelineLayout(), 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);

	vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());

	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &computeFinishBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

	endCommandBuffer(vk, *cmdBuffer);

	// Wait for completion

	submitCommandsAndWait(vk, device, queue, *cmdBuffer);

	// Validate the results

	const Allocation& bufferAllocation = buffer.getAllocation();
	invalidateAlloc(vk, device, bufferAllocation);	// needed for non-coherent memory

	const deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());

	for (int groupNdx = 0; groupNdx < workGroupCount; ++groupNdx)
	{
		const int globalOffset = groupNdx * workGroupSize;
		for (int localOffset = 0; localOffset < workGroupSize; ++localOffset)
		{
			const deUint32	res		= bufferPtr[globalOffset + localOffset];
			// Indices of the two invocations that added into this slot:
			// (localOffset-1) and (localOffset-2), wrapped modulo the group size.
			const int		offs0	= localOffset - 1 < 0 ? ((localOffset + workGroupSize - 1) % workGroupSize) : ((localOffset - 1) % workGroupSize);
			const int		offs1	= localOffset - 2 < 0 ? ((localOffset + workGroupSize - 2) % workGroupSize) : ((localOffset - 2) % workGroupSize);
			const deUint32	ref		= static_cast<deUint32>(globalOffset + offs0 + offs1);

			if (res != ref)
			{
				std::ostringstream msg;
				msg << "Comparison failed for Output.values[" << (globalOffset + localOffset) << "]";
				return tcu::TestStatus::fail(msg.str());
			}
		}
	}
	return tcu::TestStatus::pass("Compute succeeded");
}
671 
// Test case: compute shader reads texels from an r32ui storage image with
// imageLoad() and writes them linearly into an SSBO (see initPrograms).
class CopyImageToSSBOTest : public vkt::TestCase
{
public:
						CopyImageToSSBOTest		(tcu::TestContext&	testCtx,
												 const std::string&	name,
												 const tcu::IVec2&	localSize,
												 const tcu::IVec2&	imageSize,
												 const vk::ComputePipelineConstructionType computePipelineConstructionType);

	virtual void		checkSupport			(Context& context) const;
	void				initPrograms			(SourceCollections& sourceCollections) const;
	TestInstance*		createInstance			(Context&			context) const;

private:
	const tcu::IVec2					m_localSize;	// 2D work group size; must evenly divide m_imageSize
	const tcu::IVec2					m_imageSize;	// source image dimensions in texels
	vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
690 
// Per-run instance for CopyImageToSSBOTest.
class CopyImageToSSBOTestInstance : public vkt::TestInstance
{
public:
									CopyImageToSSBOTestInstance		(Context&			context,
																	 const tcu::IVec2&	localSize,
																	 const tcu::IVec2&	imageSize,
																	 const vk::ComputePipelineConstructionType computePipelineConstructionType);

	tcu::TestStatus					iterate							(void);

private:
	const tcu::IVec2					m_localSize;	// 2D work group size
	const tcu::IVec2					m_imageSize;	// source image dimensions in texels
	vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
706 
// Stores the dispatch configuration. The image must be evenly divisible by the
// work group size, since the shader is dispatched without partial groups.
CopyImageToSSBOTest::CopyImageToSSBOTest (tcu::TestContext&		testCtx,
										  const std::string&	name,
										  const tcu::IVec2&		localSize,
										  const tcu::IVec2&		imageSize,
										  const vk::ComputePipelineConstructionType computePipelineConstructionType)
	: TestCase							(testCtx, name)
	, m_localSize						(localSize)
	, m_imageSize						(imageSize)
	, m_computePipelineConstructionType	(computePipelineConstructionType)
{
	DE_ASSERT(m_imageSize.x() % m_localSize.x() == 0);
	DE_ASSERT(m_imageSize.y() % m_localSize.y() == 0);
}
720 
checkSupport(Context & context) const721 void CopyImageToSSBOTest::checkSupport (Context& context) const
722 {
723 	checkShaderObjectRequirements(context.getInstanceInterface(), context.getPhysicalDevice(), m_computePipelineConstructionType);
724 }
725 
initPrograms(SourceCollections & sourceCollections) const726 void CopyImageToSSBOTest::initPrograms (SourceCollections& sourceCollections) const
727 {
728 	std::ostringstream src;
729 	src << "#version 310 es\n"
730 		<< "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ") in;\n"
731 		<< "layout(binding = 1, r32ui) readonly uniform highp uimage2D u_srcImg;\n"
732 		<< "layout(binding = 0) writeonly buffer Output {\n"
733 		<< "    uint values[" << (m_imageSize.x() * m_imageSize.y()) << "];\n"
734 		<< "} sb_out;\n\n"
735 		<< "void main (void) {\n"
736 		<< "    uint stride = gl_NumWorkGroups.x*gl_WorkGroupSize.x;\n"
737 		<< "    uint value  = imageLoad(u_srcImg, ivec2(gl_GlobalInvocationID.xy)).x;\n"
738 		<< "    sb_out.values[gl_GlobalInvocationID.y*stride + gl_GlobalInvocationID.x] = value;\n"
739 		<< "}\n";
740 
741 	sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
742 }
743 
createInstance(Context & context) const744 TestInstance* CopyImageToSSBOTest::createInstance (Context& context) const
745 {
746 	return new CopyImageToSSBOTestInstance(context, m_localSize, m_imageSize, m_computePipelineConstructionType);
747 }
748 
// Constructor: capture the parameters; no Vulkan work is done until iterate().
CopyImageToSSBOTestInstance::CopyImageToSSBOTestInstance (Context& context, const tcu::IVec2& localSize, const tcu::IVec2& imageSize, const vk::ComputePipelineConstructionType computePipelineConstructionType)
	: TestInstance						(context)
	, m_localSize						(localSize)
	, m_imageSize						(imageSize)
	, m_computePipelineConstructionType	(computePipelineConstructionType)
{
}
756 
// Execute one run: upload random r32ui data into an image via a staging
// buffer, dispatch a compute shader that copies each texel into an SSBO,
// then read the SSBO back and compare it against the staging data.
tcu::TestStatus CopyImageToSSBOTestInstance::iterate (void)
{
	const DeviceInterface&	vk					= m_context.getDeviceInterface();
	const VkDevice			device				= m_context.getDevice();
	const VkQueue			queue				= m_context.getUniversalQueue();
	const deUint32			queueFamilyIndex	= m_context.getUniversalQueueFamilyIndex();
	Allocator&				allocator			= m_context.getDefaultAllocator();

	// Create an image (transfer destination for the upload, storage image for the shader)

	const VkImageCreateInfo imageParams = make2DImageCreateInfo(m_imageSize, VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_STORAGE_BIT);
	const ImageWithMemory image(vk, device, allocator, imageParams, MemoryRequirement::Any);

	const VkImageSubresourceRange subresourceRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
	const Unique<VkImageView> imageView(makeImageView(vk, device, *image, VK_IMAGE_VIEW_TYPE_2D, VK_FORMAT_R32_UINT, subresourceRange));

	// Staging buffer (source data for image)

	const deUint32 imageArea = multiplyComponents(m_imageSize);
	const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * imageArea;

	const BufferWithMemory stagingBuffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_TRANSFER_SRC_BIT), MemoryRequirement::HostVisible);

	// Populate the staging buffer with test data
	{
		de::Random rnd(0xab2c7);	// fixed seed for reproducibility
		const Allocation& stagingBufferAllocation = stagingBuffer.getAllocation();
		deUint32* bufferPtr = static_cast<deUint32*>(stagingBufferAllocation.getHostPtr());
		for (deUint32 i = 0; i < imageArea; ++i)
			*bufferPtr++ = rnd.getUint32();

		// Make the host writes visible to the device
		flushAlloc(vk, device, stagingBufferAllocation);
	}

	// Create a buffer to store shader output

	const BufferWithMemory outputBuffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);

	// Create descriptor set (binding 0: output SSBO, binding 1: source image — matches the shader)

	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
		DescriptorSetLayoutBuilder()
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_SHADER_STAGE_COMPUTE_BIT)
		.build(vk, device));

	const Unique<VkDescriptorPool> descriptorPool(
		DescriptorPoolBuilder()
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE)
		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

	// Set the bindings

	const VkDescriptorImageInfo imageDescriptorInfo = makeDescriptorImageInfo(DE_NULL, *imageView, VK_IMAGE_LAYOUT_GENERAL);
	const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*outputBuffer, 0ull, bufferSizeBytes);

	DescriptorSetUpdateBuilder()
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &imageDescriptorInfo)
		.update(vk, device);

	// Perform the computation
	{
		ComputePipelineWrapper			pipeline(vk, device, m_computePipelineConstructionType, m_context.getBinaryCollection().get("comp"));
		pipeline.setDescriptorSetLayout(descriptorSetLayout.get());
		pipeline.buildPipeline();

		// Makes the shader's SSBO writes visible to subsequent host reads
		const VkBufferMemoryBarrier computeFinishBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *outputBuffer, 0ull, bufferSizeBytes);
		// One workgroup per image tile (division is exact, asserted in the test case ctor)
		const tcu::IVec2 workSize = m_imageSize / m_localSize;

		// Prepare the command buffer

		const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
		const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

		// Start recording commands

		beginCommandBuffer(vk, *cmdBuffer);

		pipeline.bind(*cmdBuffer);
		vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline.getPipelineLayout(), 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);

		// Upload staging data into the image; the helper targets the compute stage
		// so the copy completes before the dispatch below reads the image.
		const std::vector<VkBufferImageCopy> bufferImageCopy(1, makeBufferImageCopy(m_imageSize));
		copyBufferToImage(vk, *cmdBuffer, *stagingBuffer, bufferSizeBytes, bufferImageCopy, VK_IMAGE_ASPECT_COLOR_BIT, 1, 1, *image, VK_IMAGE_LAYOUT_GENERAL, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT);

		vk.cmdDispatch(*cmdBuffer, workSize.x(), workSize.y(), 1u);
		vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &computeFinishBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

		endCommandBuffer(vk, *cmdBuffer);

		// Wait for completion

		submitCommandsAndWait(vk, device, queue, *cmdBuffer);
	}

	// Validate the results: output SSBO must match the staging data texel-for-texel

	const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
	invalidateAlloc(vk, device, outputBufferAllocation);

	const deUint32* bufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
	const deUint32* refBufferPtr = static_cast<deUint32*>(stagingBuffer.getAllocation().getHostPtr());

	for (deUint32 ndx = 0; ndx < imageArea; ++ndx)
	{
		const deUint32 res = *(bufferPtr + ndx);
		const deUint32 ref = *(refBufferPtr + ndx);

		if (res != ref)
		{
			std::ostringstream msg;
			msg << "Comparison failed for Output.values[" << ndx << "]";
			return tcu::TestStatus::fail(msg.str());
		}
	}
	return tcu::TestStatus::pass("Compute succeeded");
}
877 
// Test case: a compute shader reads values from an input SSBO and writes
// them into an r32ui storage image (one invocation per texel); the image is
// then read back and compared against the input buffer.
class CopySSBOToImageTest : public vkt::TestCase
{
public:
						CopySSBOToImageTest	(tcu::TestContext&	testCtx,
											 const std::string&	name,
											 const tcu::IVec2&	localSize,
											 const tcu::IVec2&	imageSize,
											 const vk::ComputePipelineConstructionType computePipelineConstructionType);

	virtual void		checkSupport		(Context& context) const;
	void				initPrograms		(SourceCollections& sourceCollections) const;
	TestInstance*		createInstance		(Context&			context) const;

private:
	// Workgroup dimensions (x, y); m_imageSize must be a multiple of these (asserted in the ctor).
	const tcu::IVec2					m_localSize;
	// Destination image dimensions in texels.
	const tcu::IVec2					m_imageSize;
	// Pipeline construction path; support is validated in checkSupport().
	vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
896 
// Per-run instance of CopySSBOToImageTest; all work happens in iterate().
class CopySSBOToImageTestInstance : public vkt::TestInstance
{
public:
									CopySSBOToImageTestInstance	(Context&			context,
																 const tcu::IVec2&	localSize,
																 const tcu::IVec2&	imageSize,
																 const vk::ComputePipelineConstructionType computePipelineConstructionType);

	tcu::TestStatus					iterate						(void);

private:
	const tcu::IVec2					m_localSize;	// workgroup dimensions (x, y)
	const tcu::IVec2					m_imageSize;	// destination image dimensions in texels
	vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
912 
// Constructor: stores test parameters. The image must tile evenly into
// workgroups so the dispatch covers every texel exactly once.
CopySSBOToImageTest::CopySSBOToImageTest (tcu::TestContext&		testCtx,
										  const std::string&	name,
										  const tcu::IVec2&		localSize,
										  const tcu::IVec2&		imageSize,
										  const vk::ComputePipelineConstructionType computePipelineConstructionType)
	: TestCase							(testCtx, name)
	, m_localSize						(localSize)
	, m_imageSize						(imageSize)
	, m_computePipelineConstructionType	(computePipelineConstructionType)
{
	// Dispatch size is computed as imageSize / localSize; it must divide evenly.
	DE_ASSERT(m_imageSize.x() % m_localSize.x() == 0);
	DE_ASSERT(m_imageSize.y() % m_localSize.y() == 0);
}
926 
checkSupport(Context & context) const927 void CopySSBOToImageTest::checkSupport (Context& context) const
928 {
929 	checkShaderObjectRequirements(context.getInstanceInterface(), context.getPhysicalDevice(), m_computePipelineConstructionType);
930 }
931 
initPrograms(SourceCollections & sourceCollections) const932 void CopySSBOToImageTest::initPrograms (SourceCollections& sourceCollections) const
933 {
934 	std::ostringstream src;
935 	src << "#version 310 es\n"
936 		<< "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ") in;\n"
937 		<< "layout(binding = 1, r32ui) writeonly uniform highp uimage2D u_dstImg;\n"
938 		<< "layout(binding = 0) readonly buffer Input {\n"
939 		<< "    uint values[" << (m_imageSize.x() * m_imageSize.y()) << "];\n"
940 		<< "} sb_in;\n\n"
941 		<< "void main (void) {\n"
942 		<< "    uint stride = gl_NumWorkGroups.x*gl_WorkGroupSize.x;\n"
943 		<< "    uint value  = sb_in.values[gl_GlobalInvocationID.y*stride + gl_GlobalInvocationID.x];\n"
944 		<< "    imageStore(u_dstImg, ivec2(gl_GlobalInvocationID.xy), uvec4(value, 0, 0, 0));\n"
945 		<< "}\n";
946 
947 	sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
948 }
949 
createInstance(Context & context) const950 TestInstance* CopySSBOToImageTest::createInstance (Context& context) const
951 {
952 	return new CopySSBOToImageTestInstance(context, m_localSize, m_imageSize, m_computePipelineConstructionType);
953 }
954 
// Constructor: capture the parameters; no Vulkan work is done until iterate().
CopySSBOToImageTestInstance::CopySSBOToImageTestInstance (Context& context, const tcu::IVec2& localSize, const tcu::IVec2& imageSize, const vk::ComputePipelineConstructionType computePipelineConstructionType)
	: TestInstance						(context)
	, m_localSize						(localSize)
	, m_imageSize						(imageSize)
	, m_computePipelineConstructionType	(computePipelineConstructionType)
{
}
962 
// Execute one run: fill an SSBO with random data, dispatch a compute shader
// that writes each value into an r32ui image, copy the image back to a
// readback buffer, and compare it against the input on the host.
tcu::TestStatus CopySSBOToImageTestInstance::iterate (void)
{
	ContextCommonData		data	= m_context.getContextCommonData();
	const DeviceInterface&	vkd		= data.vkd;

	// Create an image, a view, and the output buffer (ImageWithBuffer bundles the readback buffer)
	const VkImageSubresourceRange subresourceRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
	ImageWithBuffer imageWithBuffer(vkd, data.device, data.allocator, vk::makeExtent3D(m_imageSize.x(), m_imageSize.y(), 1),
		VK_FORMAT_R32_UINT, VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_STORAGE_BIT, vk::VK_IMAGE_TYPE_2D,
		subresourceRange);

	const deUint32 imageArea = multiplyComponents(m_imageSize);
	const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * imageArea;

	const BufferWithMemory inputBuffer(vkd, data.device, data.allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);

	// Populate the buffer with test data
	{
		de::Random rnd(0x77238ac2);	// fixed seed for reproducibility
		const Allocation& inputBufferAllocation = inputBuffer.getAllocation();
		deUint32* bufferPtr = static_cast<deUint32*>(inputBufferAllocation.getHostPtr());
		for (deUint32 i = 0; i < imageArea; ++i)
			*bufferPtr++ = rnd.getUint32();

		// Make the host writes visible to the device
		flushAlloc(vkd, data.device, inputBufferAllocation);
	}

	// Create descriptor set (binding 0: input SSBO, binding 1: destination image — matches the shader)
	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
		DescriptorSetLayoutBuilder()
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_SHADER_STAGE_COMPUTE_BIT)
		.build(vkd, data.device));

	const Unique<VkDescriptorPool> descriptorPool(
		DescriptorPoolBuilder()
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE)
		.build(vkd, data.device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vkd, data.device, *descriptorPool, *descriptorSetLayout));

	// Set the bindings

	const VkDescriptorImageInfo imageDescriptorInfo = makeDescriptorImageInfo(DE_NULL, imageWithBuffer.getImageView(), VK_IMAGE_LAYOUT_GENERAL);
	const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*inputBuffer, 0ull, bufferSizeBytes);

	DescriptorSetUpdateBuilder()
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &imageDescriptorInfo)
		.update(vkd, data.device);

	// Perform the computation
	{
		ComputePipelineWrapper			pipeline(vkd, data.device, m_computePipelineConstructionType, m_context.getBinaryCollection().get("comp"));
		pipeline.setDescriptorSetLayout(descriptorSetLayout.get());
		pipeline.buildPipeline();

		// Makes the host writes to the input SSBO visible to the compute shader
		const VkBufferMemoryBarrier inputBufferPostHostWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *inputBuffer, 0ull, bufferSizeBytes);

		// Transitions the freshly-created image to GENERAL for storage-image writes
		const VkImageMemoryBarrier imageLayoutBarrier = makeImageMemoryBarrier(
			0u, VK_ACCESS_SHADER_WRITE_BIT,
			VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL,
			imageWithBuffer.getImage(), subresourceRange);

		// One workgroup per image tile (division is exact, asserted in the test case ctor)
		const tcu::IVec2 workSize = m_imageSize / m_localSize;

		// Prepare the command buffer

		const Unique<VkCommandPool> cmdPool(makeCommandPool(vkd, data.device, data.qfIndex));
		const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vkd, data.device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

		// Start recording commands

		beginCommandBuffer(vkd, *cmdBuffer);

		pipeline.bind(*cmdBuffer);
		vkd.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline.getPipelineLayout(), 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);

		vkd.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &inputBufferPostHostWriteBarrier, 1, &imageLayoutBarrier);
		vkd.cmdDispatch(*cmdBuffer, workSize.x(), workSize.y(), 1u);

		// Read the image back into the bundled buffer; waits on the shader writes
		copyImageToBuffer(vkd, *cmdBuffer, imageWithBuffer.getImage(), imageWithBuffer.getBuffer(), m_imageSize, VK_ACCESS_SHADER_WRITE_BIT, VK_IMAGE_LAYOUT_GENERAL);

		endCommandBuffer(vkd, *cmdBuffer);

		// Wait for completion

		submitCommandsAndWait(vkd, data.device, data.queue, *cmdBuffer);
	}

	// Validate the results: readback buffer must match the input SSBO texel-for-texel

	const Allocation& outputBufferAllocation = imageWithBuffer.getBufferAllocation();
	invalidateAlloc(vkd, data.device, outputBufferAllocation);

	const deUint32* bufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
	const deUint32* refBufferPtr = static_cast<deUint32*>(inputBuffer.getAllocation().getHostPtr());

	for (deUint32 ndx = 0; ndx < imageArea; ++ndx)
	{
		const deUint32 res = *(bufferPtr + ndx);
		const deUint32 ref = *(refBufferPtr + ndx);

		if (res != ref)
		{
			std::ostringstream msg;
			msg << "Comparison failed for pixel " << ndx;
			return tcu::TestStatus::fail(msg.str());
		}
	}
	return tcu::TestStatus::pass("Compute succeeded");
}
1076 
// Test case: a compute shader reads uint values from an input buffer,
// bitwise-inverts them and writes the results to an output SSBO. The input
// is bound either as a UBO or as an SSBO, depending on which factory method
// created the case.
class BufferToBufferInvertTest : public vkt::TestCase
{
public:
	virtual void						checkSupport				(Context& context) const;
	void								initPrograms				(SourceCollections&	sourceCollections) const;
	TestInstance*						createInstance				(Context&			context) const;

	// Factory: input bound as a uniform buffer, output as an SSBO.
	static BufferToBufferInvertTest*	UBOToSSBOInvertCase			(tcu::TestContext&	testCtx,
																	 const std::string& name,
																	 const deUint32		numValues,
																	 const tcu::IVec3&	localSize,
																	 const tcu::IVec3&	workSize,
																	 const vk::ComputePipelineConstructionType computePipelineConstructionType);

	// Factory: both input and output bound as SSBOs.
	static BufferToBufferInvertTest*	CopyInvertSSBOCase			(tcu::TestContext&	testCtx,
																	 const std::string& name,
																	 const deUint32		numValues,
																	 const tcu::IVec3&	localSize,
																	 const tcu::IVec3&	workSize,
																	 const vk::ComputePipelineConstructionType computePipelineConstructionType);

private:
	// Private: instances are created only through the factory methods above.
										BufferToBufferInvertTest	(tcu::TestContext&	testCtx,
																	 const std::string& name,
																	 const deUint32		numValues,
																	 const tcu::IVec3&	localSize,
																	 const tcu::IVec3&	workSize,
																	 const BufferType	bufferType,
																	 const vk::ComputePipelineConstructionType computePipelineConstructionType);

	// BUFFER_TYPE_UNIFORM or BUFFER_TYPE_SSBO (asserted in the ctor).
	const BufferType					m_bufferType;
	// Number of uint values; must be divisible by the total invocation count (asserted in the ctor).
	const deUint32						m_numValues;
	const tcu::IVec3					m_localSize;	// workgroup dimensions
	const tcu::IVec3					m_workSize;		// dispatch dimensions in workgroups
	vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
1113 
// Per-run instance of BufferToBufferInvertTest; all work happens in iterate().
class BufferToBufferInvertTestInstance : public vkt::TestInstance
{
public:
									BufferToBufferInvertTestInstance	(Context&			context,
																		 const deUint32		numValues,
																		 const tcu::IVec3&	localSize,
																		 const tcu::IVec3&	workSize,
																		 const BufferType	bufferType,
																		 const vk::ComputePipelineConstructionType computePipelineConstructionType);

	tcu::TestStatus					iterate								(void);

private:
	const BufferType					m_bufferType;	// how the input buffer is bound (UBO vs SSBO)
	const deUint32						m_numValues;	// number of uint values processed
	const tcu::IVec3					m_localSize;	// workgroup dimensions
	const tcu::IVec3					m_workSize;		// dispatch dimensions in workgroups
	vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
1133 
// Private constructor, reached via the UBOToSSBOInvertCase/CopyInvertSSBOCase
// factories. Validates that the value count divides evenly across all
// invocations and that the buffer type is one of the two supported kinds.
BufferToBufferInvertTest::BufferToBufferInvertTest (tcu::TestContext&	testCtx,
													const std::string&	name,
													const deUint32		numValues,
													const tcu::IVec3&	localSize,
													const tcu::IVec3&	workSize,
													const BufferType	bufferType,
													const vk::ComputePipelineConstructionType computePipelineConstructionType)
	: TestCase							(testCtx, name)
	, m_bufferType						(bufferType)
	, m_numValues						(numValues)
	, m_localSize						(localSize)
	, m_workSize						(workSize)
	, m_computePipelineConstructionType	(computePipelineConstructionType)
{
	// The shader assigns numValues / totalInvocations values to each invocation.
	DE_ASSERT(m_numValues % (multiplyComponents(m_workSize) * multiplyComponents(m_localSize)) == 0);
	DE_ASSERT(m_bufferType == BUFFER_TYPE_UNIFORM || m_bufferType == BUFFER_TYPE_SSBO);
}
1151 
UBOToSSBOInvertCase(tcu::TestContext & testCtx,const std::string & name,const deUint32 numValues,const tcu::IVec3 & localSize,const tcu::IVec3 & workSize,const vk::ComputePipelineConstructionType computePipelineConstructionType)1152 BufferToBufferInvertTest* BufferToBufferInvertTest::UBOToSSBOInvertCase (tcu::TestContext&	testCtx,
1153 																		 const std::string&	name,
1154 																		 const deUint32		numValues,
1155 																		 const tcu::IVec3&	localSize,
1156 																		 const tcu::IVec3&	workSize,
1157 																		 const vk::ComputePipelineConstructionType computePipelineConstructionType)
1158 {
1159 	return new BufferToBufferInvertTest(testCtx, name, numValues, localSize, workSize, BUFFER_TYPE_UNIFORM, computePipelineConstructionType);
1160 }
1161 
CopyInvertSSBOCase(tcu::TestContext & testCtx,const std::string & name,const deUint32 numValues,const tcu::IVec3 & localSize,const tcu::IVec3 & workSize,const vk::ComputePipelineConstructionType computePipelineConstructionType)1162 BufferToBufferInvertTest* BufferToBufferInvertTest::CopyInvertSSBOCase (tcu::TestContext&	testCtx,
1163 																		const std::string&	name,
1164 																		const deUint32		numValues,
1165 																		const tcu::IVec3&	localSize,
1166 																		const tcu::IVec3&	workSize,
1167 																		const vk::ComputePipelineConstructionType computePipelineConstructionType)
1168 {
1169 	return new BufferToBufferInvertTest(testCtx, name, numValues, localSize, workSize, BUFFER_TYPE_SSBO, computePipelineConstructionType);
1170 }
1171 
checkSupport(Context & context) const1172 void BufferToBufferInvertTest::checkSupport (Context& context) const
1173 {
1174 	checkShaderObjectRequirements(context.getInstanceInterface(), context.getPhysicalDevice(), m_computePipelineConstructionType);
1175 }
1176 
initPrograms(SourceCollections & sourceCollections) const1177 void BufferToBufferInvertTest::initPrograms (SourceCollections& sourceCollections) const
1178 {
1179 	std::ostringstream src;
1180 	if (m_bufferType == BUFFER_TYPE_UNIFORM)
1181 	{
1182 		src << "#version 310 es\n"
1183 			<< "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
1184 			<< "layout(binding = 0) readonly uniform Input {\n"
1185 			<< "    uint values[" << m_numValues << "];\n"
1186 			<< "} ub_in;\n"
1187 			<< "layout(binding = 1, std140) writeonly buffer Output {\n"
1188 			<< "    uint values[" << m_numValues << "];\n"
1189 			<< "} sb_out;\n"
1190 			<< "void main (void) {\n"
1191 			<< "    uvec3 size           = gl_NumWorkGroups * gl_WorkGroupSize;\n"
1192 			<< "    uint numValuesPerInv = uint(ub_in.values.length()) / (size.x*size.y*size.z);\n"
1193 			<< "    uint groupNdx        = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
1194 			<< "    uint offset          = numValuesPerInv*groupNdx;\n"
1195 			<< "\n"
1196 			<< "    for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
1197 			<< "        sb_out.values[offset + ndx] = ~ub_in.values[offset + ndx];\n"
1198 			<< "}\n";
1199 	}
1200 	else if (m_bufferType == BUFFER_TYPE_SSBO)
1201 	{
1202 		src << "#version 310 es\n"
1203 			<< "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
1204 			<< "layout(binding = 0, std140) readonly buffer Input {\n"
1205 			<< "    uint values[" << m_numValues << "];\n"
1206 			<< "} sb_in;\n"
1207 			<< "layout (binding = 1, std140) writeonly buffer Output {\n"
1208 			<< "    uint values[" << m_numValues << "];\n"
1209 			<< "} sb_out;\n"
1210 			<< "void main (void) {\n"
1211 			<< "    uvec3 size           = gl_NumWorkGroups * gl_WorkGroupSize;\n"
1212 			<< "    uint numValuesPerInv = uint(sb_in.values.length()) / (size.x*size.y*size.z);\n"
1213 			<< "    uint groupNdx        = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
1214 			<< "    uint offset          = numValuesPerInv*groupNdx;\n"
1215 			<< "\n"
1216 			<< "    for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
1217 			<< "        sb_out.values[offset + ndx] = ~sb_in.values[offset + ndx];\n"
1218 			<< "}\n";
1219 	}
1220 
1221 	sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
1222 }
1223 
createInstance(Context & context) const1224 TestInstance* BufferToBufferInvertTest::createInstance (Context& context) const
1225 {
1226 	return new BufferToBufferInvertTestInstance(context, m_numValues, m_localSize, m_workSize, m_bufferType, m_computePipelineConstructionType);
1227 }
1228 
// Constructor: capture the parameters; no Vulkan work is done until iterate().
BufferToBufferInvertTestInstance::BufferToBufferInvertTestInstance (Context&			context,
																	const deUint32		numValues,
																	const tcu::IVec3&	localSize,
																	const tcu::IVec3&	workSize,
																	const BufferType	bufferType,
																	const vk::ComputePipelineConstructionType computePipelineConstructionType)
	: TestInstance						(context)
	, m_bufferType						(bufferType)
	, m_numValues						(numValues)
	, m_localSize						(localSize)
	, m_workSize						(workSize)
	, m_computePipelineConstructionType	(computePipelineConstructionType)
{
}
1243 
// Execute one run: fill the input buffer (UBO or SSBO) with random values,
// dispatch a compute shader that writes the bitwise inverse of each value
// to the output SSBO, and verify the result on the host.
tcu::TestStatus BufferToBufferInvertTestInstance::iterate (void)
{
	const DeviceInterface&	vk					= m_context.getDeviceInterface();
	const VkDevice			device				= m_context.getDevice();
	const VkQueue			queue				= m_context.getUniversalQueue();
	const deUint32			queueFamilyIndex	= m_context.getUniversalQueueFamilyIndex();
	Allocator&				allocator			= m_context.getDefaultAllocator();

	// Customize the test based on buffer type (usage flags, descriptor type and RNG seed differ)

	const VkBufferUsageFlags	inputBufferUsageFlags		= (m_bufferType == BUFFER_TYPE_UNIFORM ? VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT : VK_BUFFER_USAGE_STORAGE_BUFFER_BIT);
	const VkDescriptorType		inputBufferDescriptorType	= (m_bufferType == BUFFER_TYPE_UNIFORM ? VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER : VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
	const deUint32				randomSeed					= (m_bufferType == BUFFER_TYPE_UNIFORM ? 0x111223f : 0x124fef);

	// Create an input buffer
	// One UVec4 per value: the host mirrors the 16-byte std140 array stride used by the shaders.

	const VkDeviceSize		bufferSizeBytes = sizeof(tcu::UVec4) * m_numValues;
	const BufferWithMemory	inputBuffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, inputBufferUsageFlags), MemoryRequirement::HostVisible);

	// Fill the input buffer with data
	{
		de::Random rnd(randomSeed);
		const Allocation& inputBufferAllocation = inputBuffer.getAllocation();
		tcu::UVec4* bufferPtr = static_cast<tcu::UVec4*>(inputBufferAllocation.getHostPtr());
		for (deUint32 i = 0; i < m_numValues; ++i)
			bufferPtr[i].x() = rnd.getUint32();	// only the .x component is read by the shader

		// Make the host writes visible to the device
		flushAlloc(vk, device, inputBufferAllocation);
	}

	// Create an output buffer

	const BufferWithMemory outputBuffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);

	// Create descriptor set (binding 0: input UBO/SSBO, binding 1: output SSBO — matches the shader)

	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
		DescriptorSetLayoutBuilder()
		.addSingleBinding(inputBufferDescriptorType, VK_SHADER_STAGE_COMPUTE_BIT)
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.build(vk, device));

	const Unique<VkDescriptorPool> descriptorPool(
		DescriptorPoolBuilder()
		.addType(inputBufferDescriptorType)
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

	const VkDescriptorBufferInfo inputBufferDescriptorInfo = makeDescriptorBufferInfo(*inputBuffer, 0ull, bufferSizeBytes);
	const VkDescriptorBufferInfo outputBufferDescriptorInfo = makeDescriptorBufferInfo(*outputBuffer, 0ull, bufferSizeBytes);
	DescriptorSetUpdateBuilder()
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), inputBufferDescriptorType, &inputBufferDescriptorInfo)
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &outputBufferDescriptorInfo)
		.update(vk, device);

	// Perform the computation

	ComputePipelineWrapper			pipeline(vk, device, m_computePipelineConstructionType, m_context.getBinaryCollection().get("comp"));
	pipeline.setDescriptorSetLayout(descriptorSetLayout.get());
	pipeline.buildPipeline();

	// Makes the host writes to the input buffer visible to the compute shader
	const VkBufferMemoryBarrier hostWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *inputBuffer, 0ull, bufferSizeBytes);

	// Makes the shader's writes to the output buffer visible to the host readback
	const VkBufferMemoryBarrier shaderWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *outputBuffer, 0ull, bufferSizeBytes);

	const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
	const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

	// Start recording commands

	beginCommandBuffer(vk, *cmdBuffer);

	pipeline.bind(*cmdBuffer);
	vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline.getPipelineLayout(), 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);

	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
	vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &shaderWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

	endCommandBuffer(vk, *cmdBuffer);

	// Wait for completion

	submitCommandsAndWait(vk, device, queue, *cmdBuffer);

	// Validate the results: every output value must be the bitwise inverse of its input

	const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
	invalidateAlloc(vk, device, outputBufferAllocation);

	const tcu::UVec4* bufferPtr = static_cast<tcu::UVec4*>(outputBufferAllocation.getHostPtr());
	const tcu::UVec4* refBufferPtr = static_cast<tcu::UVec4*>(inputBuffer.getAllocation().getHostPtr());

	for (deUint32 ndx = 0; ndx < m_numValues; ++ndx)
	{
		const deUint32 res = bufferPtr[ndx].x();
		const deUint32 ref = ~refBufferPtr[ndx].x();

		if (res != ref)
		{
			std::ostringstream msg;
			msg << "Comparison failed for Output.values[" << ndx << "]";
			return tcu::TestStatus::fail(msg.str());
		}
	}
	return tcu::TestStatus::pass("Compute succeeded");
}
1353 
// Test case: a compute shader bitwise-inverts the contents of a single SSBO
// in place (the same buffer is both read and written by the shader).
// 'sized' selects whether the GLSL array is declared with an explicit size
// or as a runtime-sized array.
class InvertSSBOInPlaceTest : public vkt::TestCase
{
public:
						InvertSSBOInPlaceTest	(tcu::TestContext&	testCtx,
												 const std::string&	name,
												 const deUint32		numValues,
												 const bool			sized,
												 const tcu::IVec3&	localSize,
												 const tcu::IVec3&	workSize,
												 const vk::ComputePipelineConstructionType computePipelineConstructionType);

	virtual void		checkSupport			(Context& context) const;
	void				initPrograms			(SourceCollections& sourceCollections) const;
	TestInstance*		createInstance			(Context&			context) const;

private:
	const deUint32						m_numValues;	// total number of uint values processed
	const bool							m_sized;		// explicit vs. runtime-sized SSBO array in the shader
	const tcu::IVec3					m_localSize;	// workgroup (local) size
	const tcu::IVec3					m_workSize;		// number of workgroups dispatched
	vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
1376 
// Runtime instance for InvertSSBOInPlaceTest; performs the dispatch and
// verifies the in-place inversion on the host.
class InvertSSBOInPlaceTestInstance : public vkt::TestInstance
{
public:
									InvertSSBOInPlaceTestInstance	(Context&			context,
																	 const deUint32		numValues,
																	 const tcu::IVec3&	localSize,
																	 const tcu::IVec3&	workSize,
																	 const vk::ComputePipelineConstructionType computePipelineConstructionType);

	tcu::TestStatus					iterate							(void);

private:
	const deUint32					m_numValues;	// total number of uint values processed
	const tcu::IVec3				m_localSize;	// workgroup (local) size
	const tcu::IVec3				m_workSize;		// number of workgroups dispatched
	vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
1394 
InvertSSBOInPlaceTest(tcu::TestContext & testCtx,const std::string & name,const deUint32 numValues,const bool sized,const tcu::IVec3 & localSize,const tcu::IVec3 & workSize,const vk::ComputePipelineConstructionType computePipelineConstructionType)1395 InvertSSBOInPlaceTest::InvertSSBOInPlaceTest (tcu::TestContext&		testCtx,
1396 											  const std::string&	name,
1397 											  const deUint32		numValues,
1398 											  const bool			sized,
1399 											  const tcu::IVec3&		localSize,
1400 											  const tcu::IVec3&		workSize,
1401 											  const vk::ComputePipelineConstructionType computePipelineConstructionType)
1402 	: TestCase							(testCtx, name)
1403 	, m_numValues						(numValues)
1404 	, m_sized							(sized)
1405 	, m_localSize						(localSize)
1406 	, m_workSize						(workSize)
1407 	, m_computePipelineConstructionType	(computePipelineConstructionType)
1408 {
1409 	DE_ASSERT(m_numValues % (multiplyComponents(m_workSize) * multiplyComponents(m_localSize)) == 0);
1410 }
1411 
// Verifies that the device meets shader-object requirements when the pipeline
// is to be built from shader objects rather than a classic compute pipeline.
void InvertSSBOInPlaceTest::checkSupport (Context& context) const
{
	checkShaderObjectRequirements(context.getInstanceInterface(), context.getPhysicalDevice(), m_computePipelineConstructionType);
}
1416 
initPrograms(SourceCollections & sourceCollections) const1417 void InvertSSBOInPlaceTest::initPrograms (SourceCollections& sourceCollections) const
1418 {
1419 	std::ostringstream src;
1420 	src << "#version 310 es\n"
1421 		<< "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
1422 		<< "layout(binding = 0) buffer InOut {\n"
1423 		<< "    uint values[" << (m_sized ? de::toString(m_numValues) : "") << "];\n"
1424 		<< "} sb_inout;\n"
1425 		<< "void main (void) {\n"
1426 		<< "    uvec3 size           = gl_NumWorkGroups * gl_WorkGroupSize;\n"
1427 		<< "    uint numValuesPerInv = uint(sb_inout.values.length()) / (size.x*size.y*size.z);\n"
1428 		<< "    uint groupNdx        = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
1429 		<< "    uint offset          = numValuesPerInv*groupNdx;\n"
1430 		<< "\n"
1431 		<< "    for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
1432 		<< "        sb_inout.values[offset + ndx] = ~sb_inout.values[offset + ndx];\n"
1433 		<< "}\n";
1434 
1435 	sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
1436 }
1437 
// Creates the per-run instance carrying this case's parameters.
TestInstance* InvertSSBOInPlaceTest::createInstance (Context& context) const
{
	return new InvertSSBOInPlaceTestInstance(context, m_numValues, m_localSize, m_workSize, m_computePipelineConstructionType);
}
1442 
InvertSSBOInPlaceTestInstance(Context & context,const deUint32 numValues,const tcu::IVec3 & localSize,const tcu::IVec3 & workSize,const vk::ComputePipelineConstructionType computePipelineConstructionType)1443 InvertSSBOInPlaceTestInstance::InvertSSBOInPlaceTestInstance (Context&			context,
1444 															  const deUint32	numValues,
1445 															  const tcu::IVec3&	localSize,
1446 															  const tcu::IVec3&	workSize,
1447 															  const vk::ComputePipelineConstructionType computePipelineConstructionType)
1448 	: TestInstance						(context)
1449 	, m_numValues						(numValues)
1450 	, m_localSize						(localSize)
1451 	, m_workSize						(workSize)
1452 	, m_computePipelineConstructionType	(computePipelineConstructionType)
1453 {
1454 }
1455 
// Runs the in-place invert test: fill a host-visible SSBO with random values,
// dispatch a shader that bitwise-inverts every element of the same buffer,
// then read the buffer back and compare against the inverted input copy.
tcu::TestStatus InvertSSBOInPlaceTestInstance::iterate (void)
{
	const DeviceInterface&	vk					= m_context.getDeviceInterface();
	const VkDevice			device				= m_context.getDevice();
	const VkQueue			queue				= m_context.getUniversalQueue();
	const deUint32			queueFamilyIndex	= m_context.getUniversalQueueFamilyIndex();
	Allocator&				allocator			= m_context.getDefaultAllocator();

	// Create an input/output buffer

	const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * m_numValues;
	const BufferWithMemory buffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);

	// Fill the buffer with data

	typedef std::vector<deUint32> data_vector_t;
	data_vector_t inputData(m_numValues);

	{
		de::Random rnd(0x82ce7f);	// fixed seed for reproducible input
		const Allocation& bufferAllocation = buffer.getAllocation();
		deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
		// Keep a host-side copy (inputData) for verification, since the shader
		// overwrites the buffer in place.
		for (deUint32 i = 0; i < m_numValues; ++i)
			inputData[i] = *bufferPtr++ = rnd.getUint32();

		flushAlloc(vk, device, bufferAllocation);
	}

	// Create descriptor set

	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
		DescriptorSetLayoutBuilder()
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.build(vk, device));

	const Unique<VkDescriptorPool> descriptorPool(
		DescriptorPoolBuilder()
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

	const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*buffer, 0ull, bufferSizeBytes);
	DescriptorSetUpdateBuilder()
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
		.update(vk, device);

	// Perform the computation

	ComputePipelineWrapper			pipeline	(vk, device, m_computePipelineConstructionType, m_context.getBinaryCollection().get("comp"));
	pipeline.setDescriptorSetLayout(descriptorSetLayout.get());
	pipeline.buildPipeline();

	// Make the host write visible to shader reads before the dispatch...
	const VkBufferMemoryBarrier hostWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *buffer, 0ull, bufferSizeBytes);

	// ...and the shader writes visible to the host read-back afterwards.
	const VkBufferMemoryBarrier shaderWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer, 0ull, bufferSizeBytes);

	const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
	const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

	// Start recording commands

	beginCommandBuffer(vk, *cmdBuffer);

	pipeline.bind(*cmdBuffer);
	vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline.getPipelineLayout(), 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);

	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
	vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &shaderWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

	endCommandBuffer(vk, *cmdBuffer);

	// Wait for completion

	submitCommandsAndWait(vk, device, queue, *cmdBuffer);

	// Validate the results

	const Allocation& bufferAllocation = buffer.getAllocation();
	invalidateAlloc(vk, device, bufferAllocation);

	const deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());

	// Every element should now be the bitwise complement of the original input.
	for (deUint32 ndx = 0; ndx < m_numValues; ++ndx)
	{
		const deUint32 res = bufferPtr[ndx];
		const deUint32 ref = ~inputData[ndx];

		if (res != ref)
		{
			std::ostringstream msg;
			msg << "Comparison failed for InOut.values[" << ndx << "]";
			return tcu::TestStatus::fail(msg.str());
		}
	}
	return tcu::TestStatus::pass("Compute succeeded");
}
1554 
// Test case: a compute shader writes distinct patterns to two separate SSBOs
// (ascending indices to one, descending values to the other).
// 'sized' selects explicit vs. runtime-sized GLSL arrays.
class WriteToMultipleSSBOTest : public vkt::TestCase
{
public:
						WriteToMultipleSSBOTest	(tcu::TestContext&	testCtx,
												 const std::string&	name,
												 const deUint32		numValues,
												 const bool			sized,
												 const tcu::IVec3&	localSize,
												 const tcu::IVec3&	workSize,
												 const vk::ComputePipelineConstructionType computePipelineConstructionType);

	virtual void		checkSupport			(Context& context) const;
	void				initPrograms			(SourceCollections& sourceCollections) const;
	TestInstance*		createInstance			(Context&			context) const;

private:
	const deUint32						m_numValues;	// values written to each of the two buffers
	const bool							m_sized;		// explicit vs. runtime-sized SSBO arrays in the shader
	const tcu::IVec3					m_localSize;	// workgroup (local) size
	const tcu::IVec3					m_workSize;		// number of workgroups dispatched
	vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
1577 
// Runtime instance for WriteToMultipleSSBOTest; dispatches once and verifies
// both output buffers on the host.
class WriteToMultipleSSBOTestInstance : public vkt::TestInstance
{
public:
									WriteToMultipleSSBOTestInstance	(Context&			context,
																	 const deUint32		numValues,
																	 const tcu::IVec3&	localSize,
																	 const tcu::IVec3&	workSize,
																	 const vk::ComputePipelineConstructionType computePipelineConstructionType);

	tcu::TestStatus					iterate							(void);

private:
	const deUint32						m_numValues;	// values written to each of the two buffers
	const tcu::IVec3					m_localSize;	// workgroup (local) size
	const tcu::IVec3					m_workSize;		// number of workgroups dispatched
	vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
1595 
WriteToMultipleSSBOTest(tcu::TestContext & testCtx,const std::string & name,const deUint32 numValues,const bool sized,const tcu::IVec3 & localSize,const tcu::IVec3 & workSize,const vk::ComputePipelineConstructionType computePipelineConstructionType)1596 WriteToMultipleSSBOTest::WriteToMultipleSSBOTest (tcu::TestContext&		testCtx,
1597 												  const std::string&	name,
1598 												  const deUint32		numValues,
1599 												  const bool			sized,
1600 												  const tcu::IVec3&		localSize,
1601 												  const tcu::IVec3&		workSize,
1602 												  const vk::ComputePipelineConstructionType computePipelineConstructionType)
1603 	: TestCase							(testCtx, name)
1604 	, m_numValues						(numValues)
1605 	, m_sized							(sized)
1606 	, m_localSize						(localSize)
1607 	, m_workSize						(workSize)
1608 	, m_computePipelineConstructionType	(computePipelineConstructionType)
1609 {
1610 	DE_ASSERT(m_numValues % (multiplyComponents(m_workSize) * multiplyComponents(m_localSize)) == 0);
1611 }
1612 
// Verifies that the device meets shader-object requirements when the pipeline
// is to be built from shader objects rather than a classic compute pipeline.
void WriteToMultipleSSBOTest::checkSupport (Context& context) const
{
	checkShaderObjectRequirements(context.getInstanceInterface(), context.getPhysicalDevice(), m_computePipelineConstructionType);
}
1617 
initPrograms(SourceCollections & sourceCollections) const1618 void WriteToMultipleSSBOTest::initPrograms (SourceCollections& sourceCollections) const
1619 {
1620 	std::ostringstream src;
1621 	src << "#version 310 es\n"
1622 		<< "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
1623 		<< "layout(binding = 0) writeonly buffer Out0 {\n"
1624 		<< "    uint values[" << (m_sized ? de::toString(m_numValues) : "") << "];\n"
1625 		<< "} sb_out0;\n"
1626 		<< "layout(binding = 1) writeonly buffer Out1 {\n"
1627 		<< "    uint values[" << (m_sized ? de::toString(m_numValues) : "") << "];\n"
1628 		<< "} sb_out1;\n"
1629 		<< "void main (void) {\n"
1630 		<< "    uvec3 size      = gl_NumWorkGroups * gl_WorkGroupSize;\n"
1631 		<< "    uint groupNdx   = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
1632 		<< "\n"
1633 		<< "    {\n"
1634 		<< "        uint numValuesPerInv = uint(sb_out0.values.length()) / (size.x*size.y*size.z);\n"
1635 		<< "        uint offset          = numValuesPerInv*groupNdx;\n"
1636 		<< "\n"
1637 		<< "        for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
1638 		<< "            sb_out0.values[offset + ndx] = offset + ndx;\n"
1639 		<< "    }\n"
1640 		<< "    {\n"
1641 		<< "        uint numValuesPerInv = uint(sb_out1.values.length()) / (size.x*size.y*size.z);\n"
1642 		<< "        uint offset          = numValuesPerInv*groupNdx;\n"
1643 		<< "\n"
1644 		<< "        for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
1645 		<< "            sb_out1.values[offset + ndx] = uint(sb_out1.values.length()) - offset - ndx;\n"
1646 		<< "    }\n"
1647 		<< "}\n";
1648 
1649 	sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
1650 }
1651 
// Creates the per-run instance carrying this case's parameters.
TestInstance* WriteToMultipleSSBOTest::createInstance (Context& context) const
{
	return new WriteToMultipleSSBOTestInstance(context, m_numValues, m_localSize, m_workSize, m_computePipelineConstructionType);
}
1656 
WriteToMultipleSSBOTestInstance(Context & context,const deUint32 numValues,const tcu::IVec3 & localSize,const tcu::IVec3 & workSize,const vk::ComputePipelineConstructionType computePipelineConstructionType)1657 WriteToMultipleSSBOTestInstance::WriteToMultipleSSBOTestInstance (Context&			context,
1658 																  const deUint32	numValues,
1659 																  const tcu::IVec3&	localSize,
1660 																  const tcu::IVec3&	workSize,
1661 																  const vk::ComputePipelineConstructionType computePipelineConstructionType)
1662 	: TestInstance						(context)
1663 	, m_numValues						(numValues)
1664 	, m_localSize						(localSize)
1665 	, m_workSize						(workSize)
1666 	, m_computePipelineConstructionType	(computePipelineConstructionType)
1667 {
1668 }
1669 
// Runs the multi-SSBO write test: one dispatch fills two storage buffers with
// different patterns, then both are read back and checked independently.
// The shaders only write (no host-written input), so no pre-dispatch barrier
// is needed.
tcu::TestStatus WriteToMultipleSSBOTestInstance::iterate (void)
{
	const DeviceInterface&	vk					= m_context.getDeviceInterface();
	const VkDevice			device				= m_context.getDevice();
	const VkQueue			queue				= m_context.getUniversalQueue();
	const deUint32			queueFamilyIndex	= m_context.getUniversalQueueFamilyIndex();
	Allocator&				allocator			= m_context.getDefaultAllocator();

	// Create two output buffers

	const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * m_numValues;
	const BufferWithMemory buffer0(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
	const BufferWithMemory buffer1(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);

	// Create descriptor set

	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
		DescriptorSetLayoutBuilder()
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.build(vk, device));

	const Unique<VkDescriptorPool> descriptorPool(
		DescriptorPoolBuilder()
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 2u)
		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

	const VkDescriptorBufferInfo buffer0DescriptorInfo = makeDescriptorBufferInfo(*buffer0, 0ull, bufferSizeBytes);
	const VkDescriptorBufferInfo buffer1DescriptorInfo = makeDescriptorBufferInfo(*buffer1, 0ull, bufferSizeBytes);
	DescriptorSetUpdateBuilder()
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &buffer0DescriptorInfo)
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &buffer1DescriptorInfo)
		.update(vk, device);

	// Perform the computation

	ComputePipelineWrapper		pipeline(vk, device, m_computePipelineConstructionType, m_context.getBinaryCollection().get("comp"));
	pipeline.setDescriptorSetLayout(descriptorSetLayout.get());
	pipeline.buildPipeline();

	// Make shader writes to both buffers visible to the host read-back.
	const VkBufferMemoryBarrier shaderWriteBarriers[] =
	{
		makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer0, 0ull, bufferSizeBytes),
		makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer1, 0ull, bufferSizeBytes)
	};

	const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
	const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

	// Start recording commands

	beginCommandBuffer(vk, *cmdBuffer);

	pipeline.bind(*cmdBuffer);
	vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline.getPipelineLayout(), 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);

	vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, DE_LENGTH_OF_ARRAY(shaderWriteBarriers), shaderWriteBarriers, 0, (const VkImageMemoryBarrier*)DE_NULL);

	endCommandBuffer(vk, *cmdBuffer);

	// Wait for completion

	submitCommandsAndWait(vk, device, queue, *cmdBuffer);

	// Validate the results
	{
		// Buffer 0: shader wrote the flat element index at each position.
		const Allocation& buffer0Allocation = buffer0.getAllocation();
		invalidateAlloc(vk, device, buffer0Allocation);
		const deUint32* buffer0Ptr = static_cast<deUint32*>(buffer0Allocation.getHostPtr());

		for (deUint32 ndx = 0; ndx < m_numValues; ++ndx)
		{
			const deUint32 res = buffer0Ptr[ndx];
			const deUint32 ref = ndx;

			if (res != ref)
			{
				std::ostringstream msg;
				msg << "Comparison failed for Out0.values[" << ndx << "] res=" << res << " ref=" << ref;
				return tcu::TestStatus::fail(msg.str());
			}
		}
	}
	{
		// Buffer 1: shader wrote (array length - index) at each position.
		const Allocation& buffer1Allocation = buffer1.getAllocation();
		invalidateAlloc(vk, device, buffer1Allocation);
		const deUint32* buffer1Ptr = static_cast<deUint32*>(buffer1Allocation.getHostPtr());

		for (deUint32 ndx = 0; ndx < m_numValues; ++ndx)
		{
			const deUint32 res = buffer1Ptr[ndx];
			const deUint32 ref = m_numValues - ndx;

			if (res != ref)
			{
				std::ostringstream msg;
				msg << "Comparison failed for Out1.values[" << ndx << "] res=" << res << " ref=" << ref;
				return tcu::TestStatus::fail(msg.str());
			}
		}
	}
	return tcu::TestStatus::pass("Compute succeeded");
}
1776 
// Test case: two dependent compute dispatches communicate through an SSBO.
// The first shader writes per-workgroup values, a compute-to-compute barrier
// makes them visible, and the second shader atomically sums them.
class SSBOBarrierTest : public vkt::TestCase
{
public:
						SSBOBarrierTest		(tcu::TestContext&	testCtx,
											 const std::string&	name,
											 const tcu::IVec3&	workSize,
											 const vk::ComputePipelineConstructionType computePipelineConstructionType);

	virtual void		checkSupport		(Context& context) const;
	void				initPrograms		(SourceCollections& sourceCollections) const;
	TestInstance*		createInstance		(Context&			context) const;

private:
	const tcu::IVec3					m_workSize;	// number of workgroups dispatched by each pass
	vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
1793 
// Runtime instance for SSBOBarrierTest; records both dispatches with the
// intermediate barrier and verifies the atomic sum on the host.
class SSBOBarrierTestInstance : public vkt::TestInstance
{
public:
									SSBOBarrierTestInstance		(Context&			context,
																 const tcu::IVec3&	workSize,
																 const vk::ComputePipelineConstructionType computePipelineConstructionType);

	tcu::TestStatus					iterate						(void);

private:
	const tcu::IVec3					m_workSize;	// number of workgroups dispatched by each pass
	vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
1807 
SSBOBarrierTest(tcu::TestContext & testCtx,const std::string & name,const tcu::IVec3 & workSize,const vk::ComputePipelineConstructionType computePipelineConstructionType)1808 SSBOBarrierTest::SSBOBarrierTest (tcu::TestContext&		testCtx,
1809 								  const std::string&	name,
1810 								  const tcu::IVec3&		workSize,
1811 								  const vk::ComputePipelineConstructionType computePipelineConstructionType)
1812 	: TestCase		(testCtx, name)
1813 	, m_workSize	(workSize)
1814 	, m_computePipelineConstructionType(computePipelineConstructionType)
1815 {
1816 }
1817 
// Verifies that the device meets shader-object requirements when the pipeline
// is to be built from shader objects rather than a classic compute pipeline.
void SSBOBarrierTest::checkSupport (Context& context) const
{
	checkShaderObjectRequirements(context.getInstanceInterface(), context.getPhysicalDevice(), m_computePipelineConstructionType);
}
1822 
// Generates the two compute shaders of the barrier test. Both use one
// invocation per workgroup and share the SSBO at binding 1.
void SSBOBarrierTest::initPrograms (SourceCollections& sourceCollections) const
{
	// First pass: each workgroup writes (u_baseVal + flat workgroup index)
	// into its own slot of the shared buffer at binding 1.
	sourceCollections.glslSources.add("comp0") << glu::ComputeSource(
		"#version 310 es\n"
		"layout (local_size_x = 1) in;\n"
		"layout(binding = 2) readonly uniform Constants {\n"
		"    uint u_baseVal;\n"
		"};\n"
		"layout(binding = 1) writeonly buffer Output {\n"
		"    uint values[];\n"
		"};\n"
		"void main (void) {\n"
		"    uint offset = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
		"    values[offset] = u_baseVal + offset;\n"
		"}\n");

	// Second pass: each workgroup reads its slot from binding 1 and
	// atomically accumulates it into the single counter at binding 0.
	sourceCollections.glslSources.add("comp1") << glu::ComputeSource(
		"#version 310 es\n"
		"layout (local_size_x = 1) in;\n"
		"layout(binding = 1) readonly buffer Input {\n"
		"    uint values[];\n"
		"};\n"
		"layout(binding = 0) coherent buffer Output {\n"
		"    uint sum;\n"
		"};\n"
		"void main (void) {\n"
		"    uint offset = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
		"    uint value  = values[offset];\n"
		"    atomicAdd(sum, value);\n"
		"}\n");
}
1854 
// Creates the per-run instance carrying this case's parameters.
TestInstance* SSBOBarrierTest::createInstance (Context& context) const
{
	return new SSBOBarrierTestInstance(context, m_workSize, m_computePipelineConstructionType);
}
1859 
SSBOBarrierTestInstance(Context & context,const tcu::IVec3 & workSize,const vk::ComputePipelineConstructionType computePipelineConstructionType)1860 SSBOBarrierTestInstance::SSBOBarrierTestInstance (Context& context, const tcu::IVec3& workSize, const vk::ComputePipelineConstructionType computePipelineConstructionType)
1861 	: TestInstance	(context)
1862 	, m_workSize	(workSize)
1863 	, m_computePipelineConstructionType(computePipelineConstructionType)
1864 {
1865 }
1866 
// Runs the SSBO barrier test: the first dispatch writes per-workgroup values
// into a device-local work buffer, a compute-to-compute buffer barrier makes
// them visible, and the second dispatch atomically sums them into a single
// host-visible counter, which is then checked against a host-computed sum.
tcu::TestStatus SSBOBarrierTestInstance::iterate (void)
{
	const DeviceInterface&	vk					= m_context.getDeviceInterface();
	const VkDevice			device				= m_context.getDevice();
	const VkQueue			queue				= m_context.getUniversalQueue();
	const deUint32			queueFamilyIndex	= m_context.getUniversalQueueFamilyIndex();
	Allocator&				allocator			= m_context.getDefaultAllocator();

	// Create a work buffer used by both shaders

	const int workGroupCount = multiplyComponents(m_workSize);
	const VkDeviceSize workBufferSizeBytes = sizeof(deUint32) * workGroupCount;
	// Never read by the host, so any memory type is acceptable.
	const BufferWithMemory workBuffer(vk, device, allocator, makeBufferCreateInfo(workBufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::Any);

	// Create an output buffer

	const VkDeviceSize outputBufferSizeBytes = sizeof(deUint32);
	const BufferWithMemory outputBuffer(vk, device, allocator, makeBufferCreateInfo(outputBufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);

	// Initialize atomic counter value to zero
	{
		const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
		deUint32* outputBufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
		*outputBufferPtr = 0;
		flushAlloc(vk, device, outputBufferAllocation);
	}

	// Create a uniform buffer (to pass uniform constants)

	const VkDeviceSize uniformBufferSizeBytes = sizeof(deUint32);
	const BufferWithMemory uniformBuffer(vk, device, allocator, makeBufferCreateInfo(uniformBufferSizeBytes, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT), MemoryRequirement::HostVisible);

	// Set the constants in the uniform buffer

	const deUint32	baseValue = 127;
	{
		const Allocation& uniformBufferAllocation = uniformBuffer.getAllocation();
		deUint32* uniformBufferPtr = static_cast<deUint32*>(uniformBufferAllocation.getHostPtr());
		uniformBufferPtr[0] = baseValue;

		flushAlloc(vk, device, uniformBufferAllocation);
	}

	// Create descriptor set

	// Bindings match the shaders: 0 = atomic sum, 1 = work buffer, 2 = constants.
	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
		DescriptorSetLayoutBuilder()
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.addSingleBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.build(vk, device));

	const Unique<VkDescriptorPool> descriptorPool(
		DescriptorPoolBuilder()
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 2u)
		.addType(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)
		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

	const VkDescriptorBufferInfo workBufferDescriptorInfo = makeDescriptorBufferInfo(*workBuffer, 0ull, workBufferSizeBytes);
	const VkDescriptorBufferInfo outputBufferDescriptorInfo = makeDescriptorBufferInfo(*outputBuffer, 0ull, outputBufferSizeBytes);
	const VkDescriptorBufferInfo uniformBufferDescriptorInfo = makeDescriptorBufferInfo(*uniformBuffer, 0ull, uniformBufferSizeBytes);
	DescriptorSetUpdateBuilder()
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &outputBufferDescriptorInfo)
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &workBufferDescriptorInfo)
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(2u), VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, &uniformBufferDescriptorInfo)
		.update(vk, device);

	// Perform the computation

	// One pipeline per shader pass; both share the same descriptor set layout.
	ComputePipelineWrapper			pipeline0(vk, device, m_computePipelineConstructionType, m_context.getBinaryCollection().get("comp0"));
	pipeline0.setDescriptorSetLayout(descriptorSetLayout.get());
	pipeline0.buildPipeline();

	ComputePipelineWrapper			pipeline1(vk, device, m_computePipelineConstructionType, m_context.getBinaryCollection().get("comp1"));
	pipeline1.setDescriptorSetLayout(descriptorSetLayout.get());
	pipeline1.buildPipeline();

	// Host writes of the constants must be visible to uniform reads.
	const VkBufferMemoryBarrier writeUniformConstantsBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_UNIFORM_READ_BIT, *uniformBuffer, 0ull, uniformBufferSizeBytes);

	// First-pass shader writes must be visible to second-pass reads.
	const VkBufferMemoryBarrier betweenShadersBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *workBuffer, 0ull, workBufferSizeBytes);

	// Second-pass shader writes must be visible to the host read-back.
	const VkBufferMemoryBarrier afterComputeBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *outputBuffer, 0ull, outputBufferSizeBytes);

	const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
	const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

	// Start recording commands

	beginCommandBuffer(vk, *cmdBuffer);

	pipeline0.bind(*cmdBuffer);
	vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline0.getPipelineLayout(), 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);

	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &writeUniformConstantsBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

	vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &betweenShadersBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

	// Switch to the second shader program
	pipeline1.bind(*cmdBuffer);

	vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &afterComputeBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

	endCommandBuffer(vk, *cmdBuffer);

	// Wait for completion

	submitCommandsAndWait(vk, device, queue, *cmdBuffer);

	// Validate the results

	const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
	invalidateAlloc(vk, device, outputBufferAllocation);

	// Expected sum: each workgroup contributed (baseValue + its flat index).
	const deUint32* bufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
	const deUint32	res = *bufferPtr;
	deUint32		ref = 0;

	for (int ndx = 0; ndx < workGroupCount; ++ndx)
		ref += baseValue + ndx;

	if (res != ref)
	{
		std::ostringstream msg;
		msg << "ERROR: comparison failed, expected " << ref << ", got " << res;
		return tcu::TestStatus::fail(msg.str());
	}
	return tcu::TestStatus::pass("Compute succeeded");
}
1999 
// Test case: each workgroup atomically accumulates its invocations' input
// values into a single texel of an r32ui storage image (one workgroup per
// texel; see the shader built in initPrograms).
class ImageAtomicOpTest : public vkt::TestCase
{
public:
						ImageAtomicOpTest		(tcu::TestContext&	testCtx,
												 const std::string& name,
												 const deUint32		localSize,
												 const tcu::IVec2&	imageSize,
												 const vk::ComputePipelineConstructionType computePipelineConstructionType);

	// Checks availability of the requested pipeline construction type (e.g. shader objects).
	virtual void		checkSupport			(Context& context) const;
	void				initPrograms			(SourceCollections& sourceCollections) const;
	TestInstance*		createInstance			(Context&			context) const;

private:
	const deUint32						m_localSize;	// local_size_x of the compute shader
	const tcu::IVec2					m_imageSize;	// image dimensions; also the dispatch size
	vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
2018 
// Runtime side of ImageAtomicOpTest; all work happens in iterate().
class ImageAtomicOpTestInstance : public vkt::TestInstance
{
public:
									ImageAtomicOpTestInstance		(Context&			context,
																	 const deUint32		localSize,
																	 const tcu::IVec2&	imageSize,
																	 const vk::ComputePipelineConstructionType computePipelineConstructionType);

	tcu::TestStatus					iterate							(void);

private:
	const deUint32						m_localSize;	// invocations per workgroup (local_size_x)
	const tcu::IVec2					m_imageSize;	// image dimensions; also the dispatch size
	vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
2034 
ImageAtomicOpTest(tcu::TestContext & testCtx,const std::string & name,const deUint32 localSize,const tcu::IVec2 & imageSize,const vk::ComputePipelineConstructionType computePipelineConstructionType)2035 ImageAtomicOpTest::ImageAtomicOpTest (tcu::TestContext&		testCtx,
2036 									  const std::string&	name,
2037 									  const deUint32		localSize,
2038 									  const tcu::IVec2&		imageSize,
2039 	const vk::ComputePipelineConstructionType computePipelineConstructionType)
2040 	: TestCase							(testCtx, name)
2041 	, m_localSize						(localSize)
2042 	, m_imageSize						(imageSize)
2043 	, m_computePipelineConstructionType	(computePipelineConstructionType)
2044 {
2045 }
2046 
// Delegates to the shared shader-object requirement check for the chosen
// pipeline construction type (throws NotSupported when unavailable — TODO confirm
// against checkShaderObjectRequirements).
void ImageAtomicOpTest::checkSupport (Context& context) const
{
	checkShaderObjectRequirements(context.getInstanceInterface(), context.getPhysicalDevice(), m_computePipelineConstructionType);
}
2051 
initPrograms(SourceCollections & sourceCollections) const2052 void ImageAtomicOpTest::initPrograms (SourceCollections& sourceCollections) const
2053 {
2054 	std::ostringstream src;
2055 	src << "#version 310 es\n"
2056 		<< "#extension GL_OES_shader_image_atomic : require\n"
2057 		<< "layout (local_size_x = " << m_localSize << ") in;\n"
2058 		<< "layout(binding = 1, r32ui) coherent uniform highp uimage2D u_dstImg;\n"
2059 		<< "layout(binding = 0) readonly buffer Input {\n"
2060 		<< "    uint values[" << (multiplyComponents(m_imageSize) * m_localSize) << "];\n"
2061 		<< "} sb_in;\n\n"
2062 		<< "void main (void) {\n"
2063 		<< "    uint stride = gl_NumWorkGroups.x*gl_WorkGroupSize.x;\n"
2064 		<< "    uint value  = sb_in.values[gl_GlobalInvocationID.y*stride + gl_GlobalInvocationID.x];\n"
2065 		<< "\n"
2066 		<< "    if (gl_LocalInvocationIndex == 0u)\n"
2067 		<< "        imageStore(u_dstImg, ivec2(gl_WorkGroupID.xy), uvec4(0));\n"
2068 		<< "    memoryBarrierImage();\n"
2069 		<< "    barrier();\n"
2070 		<< "    imageAtomicAdd(u_dstImg, ivec2(gl_WorkGroupID.xy), value);\n"
2071 		<< "}\n";
2072 
2073 	sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
2074 }
2075 
// Creates the per-run instance, forwarding this case's parameters.
TestInstance* ImageAtomicOpTest::createInstance (Context& context) const
{
	return new ImageAtomicOpTestInstance(context, m_localSize, m_imageSize, m_computePipelineConstructionType);
}
2080 
ImageAtomicOpTestInstance(Context & context,const deUint32 localSize,const tcu::IVec2 & imageSize,const vk::ComputePipelineConstructionType computePipelineConstructionType)2081 ImageAtomicOpTestInstance::ImageAtomicOpTestInstance (Context& context, const deUint32 localSize, const tcu::IVec2& imageSize, const vk::ComputePipelineConstructionType computePipelineConstructionType)
2082 	: TestInstance	(context)
2083 	, m_localSize	(localSize)
2084 	, m_imageSize	(imageSize)
2085 	, m_computePipelineConstructionType(computePipelineConstructionType)
2086 {
2087 }
2088 
// Uploads random input values, dispatches one workgroup per texel so each
// workgroup atomically sums its m_localSize inputs into its texel, copies the
// image back to a host buffer and verifies every texel against a CPU-computed sum.
tcu::TestStatus ImageAtomicOpTestInstance::iterate (void)
{
	const DeviceInterface&	vk					= m_context.getDeviceInterface();
	const VkDevice			device				= m_context.getDevice();
	const VkQueue			queue				= m_context.getUniversalQueue();
	const deUint32			queueFamilyIndex	= m_context.getUniversalQueueFamilyIndex();
	Allocator&				allocator			= m_context.getDefaultAllocator();

	// Create an image

	// TRANSFER_SRC is required for the final image-to-buffer readback.
	const VkImageCreateInfo imageParams = make2DImageCreateInfo(m_imageSize, VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_STORAGE_BIT);
	const ImageWithMemory image(vk, device, allocator, imageParams, MemoryRequirement::Any);

	const VkImageSubresourceRange subresourceRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
	const Unique<VkImageView> imageView(makeImageView(vk, device, *image, VK_IMAGE_VIEW_TYPE_2D, VK_FORMAT_R32_UINT, subresourceRange));

	// Input buffer

	// One input value per shader invocation: imageArea workgroups * m_localSize invocations each.
	const deUint32 numInputValues = multiplyComponents(m_imageSize) * m_localSize;
	const VkDeviceSize inputBufferSizeBytes = sizeof(deUint32) * numInputValues;

	const BufferWithMemory inputBuffer(vk, device, allocator, makeBufferCreateInfo(inputBufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);

	// Populate the input buffer with test data
	{
		// Fixed seed keeps the test deterministic across runs.
		de::Random rnd(0x77238ac2);
		const Allocation& inputBufferAllocation = inputBuffer.getAllocation();
		deUint32* bufferPtr = static_cast<deUint32*>(inputBufferAllocation.getHostPtr());
		for (deUint32 i = 0; i < numInputValues; ++i)
			*bufferPtr++ = rnd.getUint32();

		flushAlloc(vk, device, inputBufferAllocation);
	}

	// Create a buffer to store shader output (copied from image data)

	const deUint32 imageArea = multiplyComponents(m_imageSize);
	const VkDeviceSize outputBufferSizeBytes = sizeof(deUint32) * imageArea;
	const BufferWithMemory outputBuffer(vk, device, allocator, makeBufferCreateInfo(outputBufferSizeBytes, VK_BUFFER_USAGE_TRANSFER_DST_BIT), MemoryRequirement::HostVisible);

	// Create descriptor set

	// Binding 0: input SSBO, binding 1: r32ui storage image — matches the shader.
	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
		DescriptorSetLayoutBuilder()
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_SHADER_STAGE_COMPUTE_BIT)
		.build(vk, device));

	const Unique<VkDescriptorPool> descriptorPool(
		DescriptorPoolBuilder()
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE)
		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

	// Set the bindings

	const VkDescriptorImageInfo imageDescriptorInfo = makeDescriptorImageInfo(DE_NULL, *imageView, VK_IMAGE_LAYOUT_GENERAL);
	const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*inputBuffer, 0ull, inputBufferSizeBytes);

	DescriptorSetUpdateBuilder()
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &imageDescriptorInfo)
		.update(vk, device);

	// Perform the computation
	{
		ComputePipelineWrapper			pipeline(vk, device, m_computePipelineConstructionType, m_context.getBinaryCollection().get("comp"));
		pipeline.setDescriptorSetLayout(descriptorSetLayout.get());
		pipeline.buildPipeline();

		// Make the host-written input data visible to shader reads.
		const VkBufferMemoryBarrier inputBufferPostHostWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *inputBuffer, 0ull, inputBufferSizeBytes);

		// Transition the image UNDEFINED -> GENERAL for storage use; old contents are discarded.
		const VkImageMemoryBarrier imageLayoutBarrier = makeImageMemoryBarrier(
			(VkAccessFlags)0, VK_ACCESS_SHADER_WRITE_BIT,
			VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL,
			*image, subresourceRange);

		// Prepare the command buffer

		const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
		const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

		// Start recording commands

		beginCommandBuffer(vk, *cmdBuffer);

		pipeline.bind(*cmdBuffer);
		vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline.getPipelineLayout(), 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);

		vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &inputBufferPostHostWriteBarrier, 1, &imageLayoutBarrier);
		// One workgroup per texel.
		vk.cmdDispatch(*cmdBuffer, m_imageSize.x(), m_imageSize.y(), 1u);

		// Read the image back into outputBuffer (presumably inserts the needed
		// pre-copy barrier itself — see vkCmdUtil; TODO confirm).
		copyImageToBuffer(vk, *cmdBuffer, *image, *outputBuffer, m_imageSize, VK_ACCESS_SHADER_WRITE_BIT, VK_IMAGE_LAYOUT_GENERAL);

		endCommandBuffer(vk, *cmdBuffer);

		// Wait for completion

		submitCommandsAndWait(vk, device, queue, *cmdBuffer);
	}

	// Validate the results

	const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
	invalidateAlloc(vk, device, outputBufferAllocation);

	const deUint32* bufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
	const deUint32* refBufferPtr = static_cast<deUint32*>(inputBuffer.getAllocation().getHostPtr());

	// Each texel must equal the sum of its workgroup's m_localSize input values
	// (wrap-around on overflow matches the shader's uint arithmetic).
	for (deUint32 pixelNdx = 0; pixelNdx < imageArea; ++pixelNdx)
	{
		const deUint32	res = bufferPtr[pixelNdx];
		deUint32		ref = 0;

		for (deUint32 offs = 0; offs < m_localSize; ++offs)
			ref += refBufferPtr[pixelNdx * m_localSize + offs];

		if (res != ref)
		{
			std::ostringstream msg;
			msg << "Comparison failed for pixel " << pixelNdx;
			return tcu::TestStatus::fail(msg.str());
		}
	}
	return tcu::TestStatus::pass("Compute succeeded");
}
2217 
// Test case: one compute shader writes per-texel values into a storage image,
// an image memory barrier is issued, then a second shader sums all texels —
// exercising shader-write -> shader-read image visibility between dispatches.
class ImageBarrierTest : public vkt::TestCase
{
public:
						ImageBarrierTest	(tcu::TestContext&	testCtx,
											const std::string&	name,
											const tcu::IVec2&	imageSize,
											const vk::ComputePipelineConstructionType computePipelineConstructionType);

	// Checks availability of the requested pipeline construction type (e.g. shader objects).
	virtual void		checkSupport		(Context& context) const;
	void				initPrograms		(SourceCollections& sourceCollections) const;
	TestInstance*		createInstance		(Context&			context) const;

private:
	const tcu::IVec2					m_imageSize;	// image dimensions; also the dispatch size
	vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
2234 
// Runtime side of ImageBarrierTest; all work happens in iterate().
class ImageBarrierTestInstance : public vkt::TestInstance
{
public:
									ImageBarrierTestInstance	(Context&			context,
																 const tcu::IVec2&	imageSize,
																 const vk::ComputePipelineConstructionType computePipelineConstructionType);

	tcu::TestStatus					iterate						(void);

private:
	const tcu::IVec2					m_imageSize;	// image dimensions; also the dispatch size
	vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
2248 
ImageBarrierTest(tcu::TestContext & testCtx,const std::string & name,const tcu::IVec2 & imageSize,const vk::ComputePipelineConstructionType computePipelineConstructionType)2249 ImageBarrierTest::ImageBarrierTest (tcu::TestContext&	testCtx,
2250 									const std::string&	name,
2251 									const tcu::IVec2&	imageSize,
2252 									const vk::ComputePipelineConstructionType computePipelineConstructionType)
2253 	: TestCase							(testCtx, name)
2254 	, m_imageSize						(imageSize)
2255 	, m_computePipelineConstructionType	(computePipelineConstructionType)
2256 {
2257 }
2258 
// Delegates to the shared shader-object requirement check for the chosen
// pipeline construction type (throws NotSupported when unavailable — TODO confirm
// against checkShaderObjectRequirements).
void ImageBarrierTest::checkSupport (Context& context) const
{
	checkShaderObjectRequirements(context.getInstanceInterface(), context.getPhysicalDevice(), m_computePipelineConstructionType);
}
2263 
initPrograms(SourceCollections & sourceCollections) const2264 void ImageBarrierTest::initPrograms (SourceCollections& sourceCollections) const
2265 {
2266 	sourceCollections.glslSources.add("comp0") << glu::ComputeSource(
2267 		"#version 310 es\n"
2268 		"layout (local_size_x = 1) in;\n"
2269 		"layout(binding = 2) readonly uniform Constants {\n"
2270 		"    uint u_baseVal;\n"
2271 		"};\n"
2272 		"layout(binding = 1, r32ui) writeonly uniform highp uimage2D u_img;\n"
2273 		"void main (void) {\n"
2274 		"    uint offset = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
2275 		"    imageStore(u_img, ivec2(gl_WorkGroupID.xy), uvec4(offset + u_baseVal, 0, 0, 0));\n"
2276 		"}\n");
2277 
2278 	sourceCollections.glslSources.add("comp1") << glu::ComputeSource(
2279 		"#version 310 es\n"
2280 		"layout (local_size_x = 1) in;\n"
2281 		"layout(binding = 1, r32ui) readonly uniform highp uimage2D u_img;\n"
2282 		"layout(binding = 0) coherent buffer Output {\n"
2283 		"    uint sum;\n"
2284 		"};\n"
2285 		"void main (void) {\n"
2286 		"    uint value = imageLoad(u_img, ivec2(gl_WorkGroupID.xy)).x;\n"
2287 		"    atomicAdd(sum, value);\n"
2288 		"}\n");
2289 }
2290 
// Creates the per-run instance, forwarding this case's parameters.
TestInstance* ImageBarrierTest::createInstance (Context& context) const
{
	return new ImageBarrierTestInstance(context, m_imageSize, m_computePipelineConstructionType);
}
2295 
ImageBarrierTestInstance(Context & context,const tcu::IVec2 & imageSize,const vk::ComputePipelineConstructionType computePipelineConstructionType)2296 ImageBarrierTestInstance::ImageBarrierTestInstance (Context& context, const tcu::IVec2& imageSize, const vk::ComputePipelineConstructionType computePipelineConstructionType)
2297 	: TestInstance						(context)
2298 	, m_imageSize						(imageSize)
2299 	, m_computePipelineConstructionType	(computePipelineConstructionType)
2300 {
2301 }
2302 
// Runs the writer shader, issues the image barrier under test, runs the reader
// shader, then checks the accumulated sum on the host against a CPU reference.
tcu::TestStatus ImageBarrierTestInstance::iterate (void)
{
	const DeviceInterface&	vk					= m_context.getDeviceInterface();
	const VkDevice			device				= m_context.getDevice();
	const VkQueue			queue				= m_context.getUniversalQueue();
	const deUint32			queueFamilyIndex	= m_context.getUniversalQueueFamilyIndex();
	Allocator&				allocator			= m_context.getDefaultAllocator();

	// Create an image used by both shaders

	const VkImageCreateInfo imageParams = make2DImageCreateInfo(m_imageSize, VK_IMAGE_USAGE_STORAGE_BIT);
	const ImageWithMemory image(vk, device, allocator, imageParams, MemoryRequirement::Any);

	const VkImageSubresourceRange subresourceRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
	const Unique<VkImageView> imageView(makeImageView(vk, device, *image, VK_IMAGE_VIEW_TYPE_2D, VK_FORMAT_R32_UINT, subresourceRange));

	// Create an output buffer

	// Single uint: the atomic counter the second shader accumulates into.
	const VkDeviceSize outputBufferSizeBytes = sizeof(deUint32);
	const BufferWithMemory outputBuffer(vk, device, allocator, makeBufferCreateInfo(outputBufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);

	// Initialize atomic counter value to zero
	{
		const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
		deUint32* outputBufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
		*outputBufferPtr = 0;
		flushAlloc(vk, device, outputBufferAllocation);
	}

	// Create a uniform buffer (to pass uniform constants)

	const VkDeviceSize uniformBufferSizeBytes = sizeof(deUint32);
	const BufferWithMemory uniformBuffer(vk, device, allocator, makeBufferCreateInfo(uniformBufferSizeBytes, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT), MemoryRequirement::HostVisible);

	// Set the constants in the uniform buffer

	// Arbitrary non-zero base added to every texel by the writer shader.
	const deUint32	baseValue = 127;
	{
		const Allocation& uniformBufferAllocation = uniformBuffer.getAllocation();
		deUint32* uniformBufferPtr = static_cast<deUint32*>(uniformBufferAllocation.getHostPtr());
		uniformBufferPtr[0] = baseValue;

		flushAlloc(vk, device, uniformBufferAllocation);
	}

	// Create descriptor set

	// Bindings shared by both shaders: 0 = output SSBO, 1 = storage image, 2 = UBO.
	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
		DescriptorSetLayoutBuilder()
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_SHADER_STAGE_COMPUTE_BIT)
		.addSingleBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.build(vk, device));

	const Unique<VkDescriptorPool> descriptorPool(
		DescriptorPoolBuilder()
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE)
		.addType(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)
		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

	const VkDescriptorImageInfo imageDescriptorInfo = makeDescriptorImageInfo(DE_NULL, *imageView, VK_IMAGE_LAYOUT_GENERAL);
	const VkDescriptorBufferInfo outputBufferDescriptorInfo = makeDescriptorBufferInfo(*outputBuffer, 0ull, outputBufferSizeBytes);
	const VkDescriptorBufferInfo uniformBufferDescriptorInfo = makeDescriptorBufferInfo(*uniformBuffer, 0ull, uniformBufferSizeBytes);
	DescriptorSetUpdateBuilder()
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &outputBufferDescriptorInfo)
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &imageDescriptorInfo)
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(2u), VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, &uniformBufferDescriptorInfo)
		.update(vk, device);

	// Perform the computation

	// Both pipelines share the same descriptor set layout (and thus the same set).
	ComputePipelineWrapper			pipeline0(vk, device, m_computePipelineConstructionType, m_context.getBinaryCollection().get("comp0"));
	pipeline0.setDescriptorSetLayout(descriptorSetLayout.get());
	pipeline0.buildPipeline();
	ComputePipelineWrapper			pipeline1(vk, device, m_computePipelineConstructionType, m_context.getBinaryCollection().get("comp1"));
	pipeline1.setDescriptorSetLayout(descriptorSetLayout.get());
	pipeline1.buildPipeline();

	// Make the host-written uniform constants visible to shader reads.
	const VkBufferMemoryBarrier writeUniformConstantsBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_UNIFORM_READ_BIT, *uniformBuffer, 0ull, uniformBufferSizeBytes);

	// Layout transition UNDEFINED -> GENERAL before first use (access masks intentionally 0).
	const VkImageMemoryBarrier imageLayoutBarrier = makeImageMemoryBarrier(
		0u, 0u,
		VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL,
		*image, subresourceRange);

	// The barrier under test: writer-dispatch image writes must become visible
	// to reader-dispatch image reads.
	const VkImageMemoryBarrier imageBarrierBetweenShaders = makeImageMemoryBarrier(
		VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT,
		VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_GENERAL,
		*image, subresourceRange);

	// Make the final sum visible to host reads for validation.
	const VkBufferMemoryBarrier afterComputeBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *outputBuffer, 0ull, outputBufferSizeBytes);

	const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
	const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

	// Start recording commands

	beginCommandBuffer(vk, *cmdBuffer);

	pipeline0.bind(*cmdBuffer);
	vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline0.getPipelineLayout(), 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);

	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &writeUniformConstantsBarrier, 1, &imageLayoutBarrier);

	// One workgroup per texel for both dispatches.
	vk.cmdDispatch(*cmdBuffer, m_imageSize.x(), m_imageSize.y(), 1u);
	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 0, (const VkBufferMemoryBarrier*)DE_NULL, 1, &imageBarrierBetweenShaders);

	// Switch to the second shader program
	pipeline1.bind(*cmdBuffer);

	vk.cmdDispatch(*cmdBuffer, m_imageSize.x(), m_imageSize.y(), 1u);
	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &afterComputeBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

	endCommandBuffer(vk, *cmdBuffer);

	// Wait for completion

	submitCommandsAndWait(vk, device, queue, *cmdBuffer);

	// Validate the results

	const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
	invalidateAlloc(vk, device, outputBufferAllocation);

	// Expected sum: over all texels, baseValue plus the texel's linear offset.
	const int		numValues = multiplyComponents(m_imageSize);
	const deUint32* bufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
	const deUint32	res = *bufferPtr;
	deUint32		ref = 0;

	for (int ndx = 0; ndx < numValues; ++ndx)
		ref += baseValue + ndx;

	if (res != ref)
	{
		std::ostringstream msg;
		msg << "ERROR: comparison failed, expected " << ref << ", got " << res;
		return tcu::TestStatus::fail(msg.str());
	}
	return tcu::TestStatus::pass("Compute succeeded");
}
2446 
// Base instance for tests that need their own device (group): creates a custom
// instance and a logical device spanning a physical device group instead of
// using the context's default device (see createDeviceGroup below).
class ComputeTestInstance : public vkt::TestInstance
{
public:
		ComputeTestInstance		(Context& context, vk::ComputePipelineConstructionType computePipelineConstructionType)
		: TestInstance						(context)
		, m_numPhysDevices					(1)
		, m_queueFamilyIndex				(0)
		, m_computePipelineConstructionType	(computePipelineConstructionType)
	{
		// Builds m_deviceGroupInstance / m_logicalDevice / m_deviceDriver and
		// fills in m_numPhysDevices and m_queueFamilyIndex.
		createDeviceGroup();
	}

		~ComputeTestInstance	()
	{
	}

	void							createDeviceGroup	(void);
	// Accessors for the custom device created by createDeviceGroup().
	const vk::DeviceInterface&		getDeviceInterface	(void)			{ return *m_deviceDriver; }
	vk::VkInstance					getInstance			(void)			{ return m_deviceGroupInstance; }
	vk::VkDevice					getDevice			(void)			{ return *m_logicalDevice; }
	vk::VkPhysicalDevice			getPhysicalDevice	(deUint32 i = 0){ return m_physicalDevices[i]; }

protected:
	deUint32							m_numPhysDevices;	// physical devices in the selected group
	deUint32							m_queueFamilyIndex;	// compute-capable family chosen by createDeviceGroup()
	vk::ComputePipelineConstructionType m_computePipelineConstructionType;

private:
	CustomInstance						m_deviceGroupInstance;
	vk::Move<vk::VkDevice>				m_logicalDevice;
	std::vector<vk::VkPhysicalDevice>	m_physicalDevices;
#ifndef CTS_USES_VULKANSC
	de::MovePtr<vk::DeviceDriver>		m_deviceDriver;
#else
	de::MovePtr<vk::DeviceDriverSC, vk::DeinitDeviceDeleter>	m_deviceDriver;
#endif // CTS_USES_VULKANSC
};
2484 
// Creates a custom instance with VK_KHR_device_group_creation, selects the
// device group / physical device given on the command line, and builds a
// logical device over the whole group plus the matching device driver.
void ComputeTestInstance::createDeviceGroup (void)
{
	const tcu::CommandLine&							cmdLine					= m_context.getTestContext().getCommandLine();
	// Command-line device-group/device IDs are 1-based; convert to 0-based indices.
	const deUint32									devGroupIdx				= cmdLine.getVKDeviceGroupId() - 1;
	const deUint32									physDeviceIdx			= cmdLine.getVKDeviceId() - 1;
	const float										queuePriority			= 1.0f;
	const std::vector<std::string>					requiredExtensions		(1, "VK_KHR_device_group_creation");
	m_deviceGroupInstance													= createCustomInstanceWithExtensions(m_context, requiredExtensions);
	std::vector<VkPhysicalDeviceGroupProperties>	devGroupProperties		= enumeratePhysicalDeviceGroups(m_context.getInstanceInterface(), m_deviceGroupInstance);
	m_numPhysDevices														= devGroupProperties[devGroupIdx].physicalDeviceCount;
	std::vector<const char*>						deviceExtensions;

	// Only request VK_KHR_device_group explicitly when it is not already core.
	if (!isCoreDeviceExtension(m_context.getUsedApiVersion(), "VK_KHR_device_group"))
		deviceExtensions.push_back("VK_KHR_device_group");

	VkDeviceGroupDeviceCreateInfo					deviceGroupInfo			=
	{
		VK_STRUCTURE_TYPE_DEVICE_GROUP_DEVICE_CREATE_INFO,									//stype
		DE_NULL,																			//pNext
		devGroupProperties[devGroupIdx].physicalDeviceCount,								//physicalDeviceCount
		devGroupProperties[devGroupIdx].physicalDevices										//physicalDevices
	};
	const InstanceDriver&							instance				(m_deviceGroupInstance.getDriver());
	VkPhysicalDeviceFeatures2						deviceFeatures2			= initVulkanStructure();
	const VkPhysicalDeviceFeatures					deviceFeatures			= getPhysicalDeviceFeatures(instance, deviceGroupInfo.pPhysicalDevices[physDeviceIdx]);
	const std::vector<VkQueueFamilyProperties>		queueProps				= getPhysicalDeviceQueueFamilyProperties(instance, devGroupProperties[devGroupIdx].physicalDevices[physDeviceIdx]);

	deviceFeatures2.features = deviceFeatures;

#ifndef CTS_USES_VULKANSC
	// Shader-object based construction needs VK_EXT_shader_object (which in turn
	// requires the dynamic rendering feature in its chain).
	VkPhysicalDeviceDynamicRenderingFeaturesKHR		dynamicRenderingFeatures	= initVulkanStructure();
	dynamicRenderingFeatures.dynamicRendering = VK_TRUE;
	VkPhysicalDeviceShaderObjectFeaturesEXT			shaderObjectFeatures		= initVulkanStructure(&dynamicRenderingFeatures);
	shaderObjectFeatures.shaderObject = VK_TRUE;
	// Non-zero construction types denote shader-object paths — presumably; see
	// ComputePipelineConstructionType's enumerators to confirm.
	if (m_computePipelineConstructionType)
	{
		deviceExtensions.push_back("VK_EXT_shader_object");
		deviceFeatures2.pNext = &shaderObjectFeatures;
	}
#endif

	m_physicalDevices.resize(m_numPhysDevices);
	for (deUint32 physDevIdx = 0; physDevIdx < m_numPhysDevices; physDevIdx++)
		m_physicalDevices[physDevIdx] = devGroupProperties[devGroupIdx].physicalDevices[physDevIdx];

	// Pick a compute-capable queue family (the last matching family wins).
	for (size_t queueNdx = 0; queueNdx < queueProps.size(); queueNdx++)
	{
		if (queueProps[queueNdx].queueFlags & VK_QUEUE_COMPUTE_BIT)
			m_queueFamilyIndex = (deUint32)queueNdx;
	}

	VkDeviceQueueCreateInfo							queueInfo				=
	{
		VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,		// VkStructureType					sType;
		DE_NULL,										// const void*						pNext;
		(VkDeviceQueueCreateFlags)0u,					// VkDeviceQueueCreateFlags			flags;
		m_queueFamilyIndex,								// deUint32							queueFamilyIndex;
		1u,												// deUint32							queueCount;
		&queuePriority									// const float*						pQueuePriorities;
	};

	// pNext chain so far: deviceGroupInfo [-> deviceFeatures2 -> shader-object features].
	void* pNext												= &deviceGroupInfo;
	if (deviceFeatures2.pNext != DE_NULL)
		deviceGroupInfo.pNext = &deviceFeatures2;

#ifdef CTS_USES_VULKANSC
	// Vulkan SC: prepend object-reservation and SC 1.0 feature structures to the chain.
	VkDeviceObjectReservationCreateInfo memReservationInfo	= cmdLine.isSubProcess() ? m_context.getResourceInterface()->getStatMax() : resetDeviceObjectReservationCreateInfo();
	memReservationInfo.pNext								= pNext;
	pNext													= &memReservationInfo;

	VkPhysicalDeviceVulkanSC10Features sc10Features			= createDefaultSC10Features();
	sc10Features.pNext										= pNext;
	pNext													= &sc10Features;
	VkPipelineCacheCreateInfo			pcCI;
	std::vector<VkPipelinePoolSize>		poolSizes;
	if (cmdLine.isSubProcess())
	{
		// Sub-process runs replay the pipeline cache / pool sizes recorded by the main process.
		if (m_context.getResourceInterface()->getCacheDataSize() > 0)
		{
			pcCI =
			{
				VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO,			// VkStructureType				sType;
				DE_NULL,												// const void*					pNext;
				VK_PIPELINE_CACHE_CREATE_READ_ONLY_BIT |
					VK_PIPELINE_CACHE_CREATE_USE_APPLICATION_STORAGE_BIT,	// VkPipelineCacheCreateFlags	flags;
				m_context.getResourceInterface()->getCacheDataSize(),	// deUintptr					initialDataSize;
				m_context.getResourceInterface()->getCacheData()		// const void*					pInitialData;
			};
			memReservationInfo.pipelineCacheCreateInfoCount		= 1;
			memReservationInfo.pPipelineCacheCreateInfos		= &pcCI;
		}

		poolSizes							= m_context.getResourceInterface()->getPipelinePoolSizes();
		if (!poolSizes.empty())
		{
			memReservationInfo.pipelinePoolSizeCount		= deUint32(poolSizes.size());
			memReservationInfo.pPipelinePoolSizes			= poolSizes.data();
		}
	}

#endif // CTS_USES_VULKANSC

	const VkDeviceCreateInfo						deviceInfo				=
	{
		VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,							// VkStructureType					sType;
		pNext,															// const void*						pNext;
		(VkDeviceCreateFlags)0,											// VkDeviceCreateFlags				flags;
		1u	,															// uint32_t							queueCreateInfoCount;
		&queueInfo,														// const VkDeviceQueueCreateInfo*	pQueueCreateInfos;
		0u,																// uint32_t							enabledLayerCount;
		DE_NULL,														// const char* const*				ppEnabledLayerNames;
		deUint32(deviceExtensions.size()),								// uint32_t							enabledExtensionCount;
		(deviceExtensions.empty() ? DE_NULL : &deviceExtensions[0]),	// const char* const*				ppEnabledExtensionNames;
		// Features go in either pEnabledFeatures or the Features2 chained above — never both.
		deviceFeatures2.pNext == DE_NULL ? &deviceFeatures : DE_NULL,	// const VkPhysicalDeviceFeatures*	pEnabledFeatures;
	};

	m_logicalDevice		= createCustomDevice(m_context.getTestContext().getCommandLine().isValidationEnabled(), m_context.getPlatformInterface(), m_deviceGroupInstance, instance, deviceGroupInfo.pPhysicalDevices[physDeviceIdx], &deviceInfo);
#ifndef CTS_USES_VULKANSC
	m_deviceDriver = de::MovePtr<DeviceDriver>(new DeviceDriver(m_context.getPlatformInterface(), m_deviceGroupInstance, *m_logicalDevice, m_context.getUsedApiVersion()));
#else
	m_deviceDriver = de::MovePtr<DeviceDriverSC, DeinitDeviceDeleter>(new DeviceDriverSC(m_context.getPlatformInterface(), m_context.getInstance(), *m_logicalDevice, m_context.getTestContext().getCommandLine(), m_context.getResourceInterface(), m_context.getDeviceVulkanSC10Properties(), m_context.getDeviceProperties(), m_context.getUsedApiVersion()), vk::DeinitDeviceDeleter(m_context.getResourceInterface().get(), *m_logicalDevice));
#endif // CTS_USES_VULKANSC
}
2608 
// Test case for vkCmdDispatchBase: a compute dispatch whose workgroup IDs start at a
// non-zero base, allowing one logical workload to be split into sub-dispatches (e.g.
// one slice per physical device in a device group).
class DispatchBaseTest : public vkt::TestCase
{
public:
						DispatchBaseTest	(tcu::TestContext&	testCtx,
											const std::string&	name,
											const deUint32		numValues,
											const tcu::IVec3&	localsize,
											const tcu::IVec3&	worksize,
											const tcu::IVec3&	splitsize,
											const vk::ComputePipelineConstructionType computePipelineConstructionType,
											const bool			useMaintenance5);

	virtual void		checkSupport		(Context& context) const;
	void				initPrograms		(SourceCollections& sourceCollections) const;
	TestInstance*		createInstance		(Context&			context) const;

private:
	const deUint32						m_numValues;		// Element count of the in/out storage buffer
	const tcu::IVec3					m_localSize;		// Workgroup (local) size
	const tcu::IVec3					m_workSize;			// Full dispatch size, in workgroups
	const tcu::IVec3					m_splitSize;		// Per-sub-dispatch slice of the full dispatch
	vk::ComputePipelineConstructionType m_computePipelineConstructionType;
	const bool							m_useMaintenance5;	// Use VkPipelineCreateFlags2CreateInfoKHR instead of legacy create flags
};
2633 
// Instance side of DispatchBaseTest: records one vkCmdDispatchBase per slice so the
// union of all sub-dispatches covers the full m_workSize grid exactly once.
class DispatchBaseTestInstance : public ComputeTestInstance
{
public:
									DispatchBaseTestInstance	(Context&			context,
																const deUint32		numValues,
																const tcu::IVec3&	localsize,
																const tcu::IVec3&	worksize,
																const tcu::IVec3&	splitsize,
																const vk::ComputePipelineConstructionType computePipelineConstructionType,
																const bool			useMaintenance5);

	// Returns true when every component of 'big' is a multiple of (and not less than)
	// the matching component of 'small'.
	bool							isInputVectorValid			(const tcu::IVec3& small, const tcu::IVec3& big);
	tcu::TestStatus					iterate						(void);

private:
	const deUint32						m_numValues;		// Element count of the in/out storage buffer
	const tcu::IVec3					m_localSize;		// Workgroup (local) size
	const tcu::IVec3					m_workSize;			// Full dispatch size, in workgroups
	const tcu::IVec3					m_splitWorkSize;	// Slice handed to each physical device
	const bool							m_useMaintenance5;	// Use VkPipelineCreateFlags2CreateInfoKHR path
};
2655 
// Constructor: stores the immutable test configuration; all validation happens in the
// instance constructor where the physical-device count is known.
DispatchBaseTest::DispatchBaseTest (tcu::TestContext&	testCtx,
									const std::string&	name,
									const deUint32		numValues,
									const tcu::IVec3&	localsize,
									const tcu::IVec3&	worksize,
									const tcu::IVec3&	splitsize,
									const vk::ComputePipelineConstructionType computePipelineConstructionType,
									const bool			useMaintenance5)
	: TestCase		(testCtx, name)
	, m_numValues	(numValues)
	, m_localSize	(localsize)
	, m_workSize	(worksize)
	, m_splitSize	(splitsize)
	, m_computePipelineConstructionType(computePipelineConstructionType)
	, m_useMaintenance5	(useMaintenance5)
{
}
2673 
checkSupport(Context & context) const2674 void DispatchBaseTest::checkSupport (Context& context) const
2675 {
2676 	checkShaderObjectRequirements(context.getInstanceInterface(), context.getPhysicalDevice(), m_computePipelineConstructionType);
2677 	if (m_useMaintenance5)
2678 		context.requireDeviceFunctionality("VK_KHR_maintenance5");
2679 }
2680 
initPrograms(SourceCollections & sourceCollections) const2681 void DispatchBaseTest::initPrograms (SourceCollections& sourceCollections) const
2682 {
2683 	std::ostringstream src;
2684 	src << "#version 310 es\n"
2685 		<< "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
2686 
2687 		<< "layout(binding = 0) buffer InOut {\n"
2688 		<< "    uint values[" << de::toString(m_numValues) << "];\n"
2689 		<< "} sb_inout;\n"
2690 
2691 		<< "layout(binding = 1) readonly uniform uniformInput {\n"
2692 		<< "    uvec3 gridSize;\n"
2693 		<< "} ubo_in;\n"
2694 
2695 		<< "void main (void) {\n"
2696 		<< "    uvec3 size = ubo_in.gridSize * gl_WorkGroupSize;\n"
2697 		<< "    uint numValuesPerInv = uint(sb_inout.values.length()) / (size.x*size.y*size.z);\n"
2698 		<< "    uint index = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
2699 		<< "    uint offset = numValuesPerInv*index;\n"
2700 		<< "    for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
2701 		<< "        sb_inout.values[offset + ndx] = ~sb_inout.values[offset + ndx];\n"
2702 		<< "}\n";
2703 
2704 	sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
2705 }
2706 
// Creates the per-run instance; validation of the size parameters is done there.
TestInstance* DispatchBaseTest::createInstance (Context& context) const
{
	return new DispatchBaseTestInstance(context, m_numValues, m_localSize, m_workSize, m_splitSize, m_computePipelineConstructionType, m_useMaintenance5);
}
2711 
// Instance constructor: verifies that the work/split/local sizes form a configuration
// that can be evenly distributed across the device group, throwing TestError otherwise.
DispatchBaseTestInstance::DispatchBaseTestInstance (Context& context,
													const deUint32		numValues,
													const tcu::IVec3&	localsize,
													const tcu::IVec3&	worksize,
													const tcu::IVec3&	splitsize,
													const vk::ComputePipelineConstructionType computePipelineConstructionType,
													const bool			useMaintenance5)

	: ComputeTestInstance				(context, computePipelineConstructionType)
	, m_numValues						(numValues)
	, m_localSize						(localsize)
	, m_workSize						(worksize)
	, m_splitWorkSize					(splitsize)
	, m_useMaintenance5		(useMaintenance5)
{
	// For easy work distribution across physical devices:
	// WorkSize should be a multiple of SplitWorkSize only in the X component
	// (strictly larger in X, identical in Y and Z).
	if ((!isInputVectorValid(m_splitWorkSize, m_workSize)) ||
		(m_workSize.x() <= m_splitWorkSize.x()) ||
		(m_workSize.y() != m_splitWorkSize.y()) ||
		(m_workSize.z() != m_splitWorkSize.z()))
		TCU_THROW(TestError, "Invalid Input.");

	// For easy work distribution within the same physical device:
	// SplitWorkSize should be a multiple of localSize in Y or Z component
	// (identical in X, strictly larger in Y and Z).
	if ((!isInputVectorValid(m_localSize, m_splitWorkSize)) ||
		(m_localSize.x() != m_splitWorkSize.x()) ||
		(m_localSize.y() >= m_splitWorkSize.y()) ||
		(m_localSize.z() >= m_splitWorkSize.z()))
		TCU_THROW(TestError, "Invalid Input.");

	// Every physical device in the group must receive at least one split of work.
	if ((multiplyComponents(m_workSize) / multiplyComponents(m_splitWorkSize)) < (deInt32) m_numPhysDevices)
		TCU_THROW(TestError, "Not enough work to distribute across all physical devices.");

	// The buffer must hold a whole (non-zero) number of values per invocation.
	deUint32 totalWork = multiplyComponents(m_workSize) * multiplyComponents(m_localSize);
	if ((totalWork > numValues) || (numValues % totalWork != 0))
		TCU_THROW(TestError, "Buffer too small/not aligned to cover all values.");
}
2750 
isInputVectorValid(const tcu::IVec3 & small,const tcu::IVec3 & big)2751 bool DispatchBaseTestInstance::isInputVectorValid(const tcu::IVec3& small, const tcu::IVec3& big)
2752 {
2753 	if (((big.x() < small.x()) || (big.y() < small.y()) || (big.z() < small.z())) ||
2754 		((big.x() % small.x() != 0) || (big.y() % small.y() != 0) || (big.z() % small.z() != 0)))
2755 		return false;
2756 	return true;
2757 }
2758 
// Runs the dispatch-base test: fills a UBO with the full grid size and an SSBO with
// random values, then covers the whole grid with a series of vkCmdDispatchBase calls
// (one slice per physical device, further split along Y/Z within each device) and
// checks that every SSBO value has been bitwise-inverted exactly once.
tcu::TestStatus DispatchBaseTestInstance::iterate (void)
{
	const DeviceInterface&	vk					= getDeviceInterface();
	const VkDevice			device				= getDevice();
	const VkQueue			queue				= getDeviceQueue(vk, device, m_queueFamilyIndex, 0);
	SimpleAllocator			allocator			(vk, device, getPhysicalDeviceMemoryProperties(m_context.getInstanceInterface(), getPhysicalDevice()));
	deUint32				totalWorkloadSize	= 0;	// Workgroups actually dispatched; must equal multiplyComponents(m_workSize)

	// Create an uniform and input/output buffer
	const deUint32 uniformBufSize = 3; // Pass the compute grid size
	const VkDeviceSize uniformBufferSizeBytes = sizeof(deUint32) * uniformBufSize;
	const BufferWithMemory uniformBuffer(vk, device, allocator, makeBufferCreateInfo(uniformBufferSizeBytes, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT), MemoryRequirement::HostVisible);

	const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * m_numValues;
	const BufferWithMemory buffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);

	// Fill the buffers with data
	typedef std::vector<deUint32> data_vector_t;
	data_vector_t uniformInputData(uniformBufSize);
	data_vector_t inputData(m_numValues);

	// The UBO carries the full grid size so the shader can compute global indices
	// even though each sub-dispatch only sees part of the grid.
	{
		const Allocation& bufferAllocation = uniformBuffer.getAllocation();
		deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
		uniformInputData[0] = *bufferPtr++ = m_workSize.x();
		uniformInputData[1] = *bufferPtr++ = m_workSize.y();
		uniformInputData[2] = *bufferPtr++ = m_workSize.z();
		flushAlloc(vk, device, bufferAllocation);
	}

	// Random SSBO contents; a host-side copy is kept for result verification.
	{
		de::Random rnd(0x82ce7f);
		const Allocation& bufferAllocation = buffer.getAllocation();
		deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
		for (deUint32 i = 0; i < m_numValues; ++i)
			inputData[i] = *bufferPtr++ = rnd.getUint32();

		flushAlloc(vk, device, bufferAllocation);
	}

	// Create descriptor set
	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
		DescriptorSetLayoutBuilder()
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.addSingleBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.build(vk, device));

	const Unique<VkDescriptorPool> descriptorPool(
		DescriptorPoolBuilder()
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
		.addType(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)
		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

	const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*buffer, 0ull, bufferSizeBytes);
	const VkDescriptorBufferInfo uniformBufferDescriptorInfo = makeDescriptorBufferInfo(*uniformBuffer, 0ull, uniformBufferSizeBytes);

	DescriptorSetUpdateBuilder()
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, &uniformBufferDescriptorInfo)
		.update(vk, device);

	// The pipeline must be created with the DISPATCH_BASE flag to allow non-zero
	// base workgroup IDs in vkCmdDispatchBase.
	ComputePipelineWrapper			pipeline(vk, device, m_computePipelineConstructionType, m_context.getBinaryCollection().get("comp"));
	pipeline.setDescriptorSetLayout(descriptorSetLayout.get());
	pipeline.setPipelineCreateFlags(VK_PIPELINE_CREATE_DISPATCH_BASE);

#ifndef CTS_USES_VULKANSC
	// maintenance5 variant: express the same flag through VkPipelineCreateFlags2CreateInfoKHR
	// and clear the legacy flags field.
	if (m_useMaintenance5)
	{
		VkPipelineCreateFlags2CreateInfoKHR pipelineFlags2CreateInfo = initVulkanStructure();
		pipelineFlags2CreateInfo.flags = VK_PIPELINE_CREATE_2_DISPATCH_BASE_BIT_KHR;
		pipeline.setPipelineCreatePNext(&pipelineFlags2CreateInfo);
		pipeline.setPipelineCreateFlags(0);
	}
#else
	DE_UNREF(m_useMaintenance5);
#endif // CTS_USES_VULKANSC

	pipeline.buildPipeline();

	const VkBufferMemoryBarrier hostWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *buffer, 0ull, bufferSizeBytes);
	const VkBufferMemoryBarrier hostUniformWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_UNIFORM_READ_BIT, *uniformBuffer, 0ull, uniformBufferSizeBytes);

	const VkBufferMemoryBarrier shaderWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer, 0ull, bufferSizeBytes);

	const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, m_queueFamilyIndex));
	const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

	// Start recording commands
	beginCommandBuffer(vk, *cmdBuffer);

	pipeline.bind(*cmdBuffer);
	vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline.getPipelineLayout(), 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);

	// Make the host-written UBO and SSBO contents visible to the compute stage.
	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostUniformWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

	// Split the workload across all physical devices based on m_splitWorkSize.x()
	for (deUint32 physDevIdx = 0; physDevIdx < m_numPhysDevices; physDevIdx++)
	{
		deUint32 baseGroupX = physDevIdx * m_splitWorkSize.x();
		deUint32 baseGroupY = 0;
		deUint32 baseGroupZ = 0;

		// Split the workload within the physical device based on m_localSize.y() and m_localSize.z()
		for (deInt32 localIdxY = 0; localIdxY < (m_splitWorkSize.y() / m_localSize.y()); localIdxY++)
		{
			for (deInt32 localIdxZ = 0; localIdxZ < (m_splitWorkSize.z() / m_localSize.z()); localIdxZ++)
			{
				deUint32 offsetX = baseGroupX;
				deUint32 offsetY = baseGroupY + localIdxY * m_localSize.y();
				deUint32 offsetZ = baseGroupZ + localIdxZ * m_localSize.z();

				// The last physical device picks up the remainder in X so the whole
				// grid is covered even when it is not an exact multiple of the split.
				deUint32 localSizeX = (physDevIdx == (m_numPhysDevices - 1)) ? m_workSize.x() - baseGroupX : m_localSize.x();
				deUint32 localSizeY = m_localSize.y();
				deUint32 localSizeZ = m_localSize.z();

				totalWorkloadSize += (localSizeX * localSizeY * localSizeZ);
				vk.cmdDispatchBase(*cmdBuffer, offsetX, offsetY, offsetZ, localSizeX, localSizeY, localSizeZ);
			}
		}
	}

	// Make the shader writes visible to host reads for verification.
	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &shaderWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

	endCommandBuffer(vk, *cmdBuffer);
	submitCommandsAndWait(vk, device, queue, *cmdBuffer);

	if (totalWorkloadSize != deUint32(multiplyComponents(m_workSize)))
		TCU_THROW(TestError, "Not covering the entire workload.");

	// Validate the results
	const Allocation& bufferAllocation = buffer.getAllocation();
	invalidateAlloc(vk, device, bufferAllocation);
	const deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());

	for (deUint32 ndx = 0; ndx < m_numValues; ++ndx)
	{
		const deUint32 res = bufferPtr[ndx];
		const deUint32 ref = ~inputData[ndx];	// Shader applies bitwise NOT exactly once per value

		if (res != ref)
		{
			std::ostringstream msg;
			msg << "Comparison failed for InOut.values[" << ndx << "]";
			return tcu::TestStatus::fail(msg.str());
		}
	}
	return tcu::TestStatus::pass("Compute succeeded");
}
2911 
2912 class DeviceIndexTest : public vkt::TestCase
2913 {
2914 public:
2915 	DeviceIndexTest		(tcu::TestContext&	testCtx,
2916 											const std::string&	name,
2917 											const deUint32		numValues,
2918 											const tcu::IVec3&	localsize,
2919 											const tcu::IVec3&	splitsize,
2920 											const vk::ComputePipelineConstructionType computePipelineConstructionType);
2921 
2922 	virtual void		checkSupport		(Context& context) const;
2923 	void				initPrograms		(SourceCollections& sourceCollections) const;
2924 	TestInstance*		createInstance		(Context&			context) const;
2925 
2926 private:
2927 	const deUint32						m_numValues;
2928 	const tcu::IVec3					m_localSize;
2929 	const tcu::IVec3					m_workSize;
2930 	const tcu::IVec3					m_splitSize;
2931 	vk::ComputePipelineConstructionType m_computePipelineConstructionType;
2932 };
2933 
// Instance side of DeviceIndexTest: submits the dispatch under varying device masks
// and verifies the per-device results through a staging copy.
class DeviceIndexTestInstance : public ComputeTestInstance
{
public:
									DeviceIndexTestInstance	(Context&			context,
															 const deUint32		numValues,
															 const tcu::IVec3&	localsize,
															 const tcu::IVec3&	worksize,
															 const vk::ComputePipelineConstructionType computePipelineConstructionType);
	tcu::TestStatus					iterate					(void);
private:
	const deUint32						m_numValues;	// Element count of the output storage buffer
	const tcu::IVec3					m_localSize;	// Workgroup (local) size
	tcu::IVec3							m_workSize;		// Dispatch size, in workgroups
};
2948 
// Constructor: stores the immutable test configuration.
DeviceIndexTest::DeviceIndexTest (tcu::TestContext&	testCtx,
									const std::string&	name,
									const deUint32		numValues,
									const tcu::IVec3&	localsize,
									const tcu::IVec3&	worksize,
									const vk::ComputePipelineConstructionType computePipelineConstructionType)
	: TestCase							(testCtx, name)
	, m_numValues						(numValues)
	, m_localSize						(localsize)
	, m_workSize						(worksize)
	, m_computePipelineConstructionType	(computePipelineConstructionType)
{
}
2962 
checkSupport(Context & context) const2963 void DeviceIndexTest::checkSupport (Context& context) const
2964 {
2965 	checkShaderObjectRequirements(context.getInstanceInterface(), context.getPhysicalDevice(), m_computePipelineConstructionType);
2966 }
2967 
initPrograms(SourceCollections & sourceCollections) const2968 void DeviceIndexTest::initPrograms (SourceCollections& sourceCollections) const
2969 {
2970 	std::ostringstream src;
2971 	src << "#version 310 es\n"
2972 		<< "#extension GL_EXT_device_group : require\n"
2973 		<< "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
2974 
2975 		<< "layout(binding = 0) buffer InOut {\n"
2976 		<< "    uint values[" << de::toString(m_numValues) << "];\n"
2977 		<< "} sb_inout;\n"
2978 
2979 		<< "layout(binding = 1) readonly uniform uniformInput {\n"
2980 		<< "    uint baseOffset[1+" << VK_MAX_DEVICE_GROUP_SIZE << "];\n"
2981 		<< "} ubo_in;\n"
2982 
2983 		<< "void main (void) {\n"
2984 		<< "    uvec3 size = gl_NumWorkGroups * gl_WorkGroupSize;\n"
2985 		<< "    uint numValuesPerInv = uint(sb_inout.values.length()) / (size.x*size.y*size.z);\n"
2986 		<< "    uint index = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
2987 		<< "    uint offset = numValuesPerInv*index;\n"
2988 		<< "    for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
2989 		<< "        sb_inout.values[offset + ndx] = ubo_in.baseOffset[0] + ubo_in.baseOffset[gl_DeviceIndex + 1];\n"
2990 		<< "}\n";
2991 
2992 	sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
2993 }
2994 
// Creates the per-run instance with the configured sizes.
TestInstance* DeviceIndexTest::createInstance (Context& context) const
{
	return new DeviceIndexTestInstance(context, m_numValues, m_localSize, m_workSize, m_computePipelineConstructionType);
}
2999 
// Instance constructor: stores the immutable run configuration.
DeviceIndexTestInstance::DeviceIndexTestInstance (Context& context,
													const deUint32		numValues,
													const tcu::IVec3&	localsize,
													const tcu::IVec3&	worksize,
													const vk::ComputePipelineConstructionType computePipelineConstructionType)

	: ComputeTestInstance				(context, computePipelineConstructionType)
	, m_numValues						(numValues)
	, m_localSize						(localsize)
	, m_workSize						(worksize)
{}
3011 
// Runs the device-index test: allocates an SSBO whose memory is replicated on every
// physical device of the group (VK_MEMORY_ALLOCATE_DEVICE_MASK_BIT), then for each
// possible device mask submits the dispatch, copies each participating device's SSBO
// instance to a host-visible staging buffer, and checks that every value equals
// baseOffset[0] + baseOffset[deviceIndex + 1].
tcu::TestStatus DeviceIndexTestInstance::iterate (void)
{
	const DeviceInterface&			vk					= getDeviceInterface();
	const VkDevice					device				= getDevice();
	const VkQueue					queue				= getDeviceQueue(vk, device, m_queueFamilyIndex, 0);
	SimpleAllocator					allocator			(vk, device, getPhysicalDeviceMemoryProperties(m_context.getInstanceInterface(), getPhysicalDevice()));
	const deUint32					allocDeviceMask		= (1 << m_numPhysDevices) - 1;	// All physical devices in the group
	de::Random						rnd					(0x82ce7f);
	Move<VkBuffer>					sboBuffer;
	vk::Move<vk::VkDeviceMemory>	sboBufferMemory;

	// Create an uniform and output buffer
	const deUint32 uniformBufSize = 4 * (1 + VK_MAX_DEVICE_GROUP_SIZE);	// std140: each uint array element occupies 16 bytes (4 uints)
	const VkDeviceSize uniformBufferSizeBytes = sizeof(deUint32) * uniformBufSize;
	const BufferWithMemory uniformBuffer(vk, device, allocator, makeBufferCreateInfo(uniformBufferSizeBytes, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT), MemoryRequirement::HostVisible);

	// Host-visible staging buffer used to read back the per-device SSBO contents.
	const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * m_numValues;
	const BufferWithMemory checkBuffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_TRANSFER_DST_BIT), MemoryRequirement::HostVisible);

	// create SBO buffer
	{
		const VkBufferCreateInfo	sboBufferParams =
		{
			VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,									// sType
			DE_NULL,																// pNext
			0u,																		// flags
			(VkDeviceSize)bufferSizeBytes,											// size
			VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT,	// usage
			VK_SHARING_MODE_EXCLUSIVE,												// sharingMode
			1u,																		// queueFamilyIndexCount
			&m_queueFamilyIndex,														// pQueueFamilyIndices
		};
		sboBuffer = createBuffer(vk, device, &sboBufferParams);

		// Pick a device-local memory type compatible with the buffer's requirements.
		VkMemoryRequirements memReqs = getBufferMemoryRequirements(vk, device, sboBuffer.get());
		deUint32 memoryTypeNdx = 0;
		const VkPhysicalDeviceMemoryProperties deviceMemProps = getPhysicalDeviceMemoryProperties(m_context.getInstanceInterface(), getPhysicalDevice());
		for ( memoryTypeNdx = 0; memoryTypeNdx < deviceMemProps.memoryTypeCount; memoryTypeNdx++)
		{
			if ((memReqs.memoryTypeBits & (1u << memoryTypeNdx)) != 0 &&
				(deviceMemProps.memoryTypes[memoryTypeNdx].propertyFlags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) == VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT)
				break;
		}
		if (memoryTypeNdx == deviceMemProps.memoryTypeCount)
			TCU_THROW(NotSupportedError, "No compatible memory type found");

		// Allocate the memory on every physical device so each one holds its own
		// instance of the SSBO (written independently per gl_DeviceIndex).
		const VkMemoryAllocateFlagsInfo allocDeviceMaskInfo =
		{
			VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO,		// sType
			DE_NULL,											// pNext
			VK_MEMORY_ALLOCATE_DEVICE_MASK_BIT,					// flags
			allocDeviceMask,									// deviceMask
		};

		VkMemoryAllocateInfo		allocInfo =
		{
			VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,			// sType
			&allocDeviceMaskInfo,							// pNext
			memReqs.size,									// allocationSize
			memoryTypeNdx,									// memoryTypeIndex
		};

		sboBufferMemory = allocateMemory(vk, device, &allocInfo);
		VK_CHECK(vk.bindBufferMemory(device, *sboBuffer, sboBufferMemory.get(), 0));
	}

	// Fill the buffers with data
	typedef std::vector<deUint32> data_vector_t;
	data_vector_t uniformInputData(uniformBufSize, 0);

	{
		const Allocation& bufferAllocation = uniformBuffer.getAllocation();
		deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
		for (deUint32 i = 0; i < uniformBufSize; ++i)
			uniformInputData[i] = *bufferPtr++ = rnd.getUint32() / 10; // divide to prevent overflow in addition

		flushAlloc(vk, device, bufferAllocation);
	}

	// Create descriptor set
	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
		DescriptorSetLayoutBuilder()
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.addSingleBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.build(vk, device));

	const Unique<VkDescriptorPool> descriptorPool(
		DescriptorPoolBuilder()
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
		.addType(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)
		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

	const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*sboBuffer, 0ull, bufferSizeBytes);
	const VkDescriptorBufferInfo uniformBufferDescriptorInfo = makeDescriptorBufferInfo(*uniformBuffer, 0ull, uniformBufferSizeBytes);

	DescriptorSetUpdateBuilder()
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, &uniformBufferDescriptorInfo)
		.update(vk, device);

	ComputePipelineWrapper			pipeline(vk, device, m_computePipelineConstructionType, m_context.getBinaryCollection().get("comp"));
	pipeline.setDescriptorSetLayout(descriptorSetLayout.get());
	pipeline.buildPipeline();

	const VkBufferMemoryBarrier hostUniformWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_UNIFORM_READ_BIT, *uniformBuffer, 0ull, uniformBufferSizeBytes);
	const VkBufferMemoryBarrier shaderWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT , *sboBuffer, 0ull, bufferSizeBytes);

	const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, m_queueFamilyIndex));
	const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

	// Verify multiple device masks
	for (deUint32 physDevMask = 1; physDevMask < (1u << m_numPhysDevices); physDevMask++)
	{
		// Rewrite baseOffset[0] each loop so every mask iteration produces a fresh
		// expected value; the host-side copy keeps the per-device offsets.
		deUint32 constantValPerLoop = 0;
		{
			const Allocation& bufferAllocation = uniformBuffer.getAllocation();
			deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
			constantValPerLoop = *bufferPtr = rnd.getUint32() / 10;  // divide to prevent overflow in addition
			flushAlloc(vk, device, bufferAllocation);
		}
		beginCommandBuffer(vk, *cmdBuffer);

		pipeline.bind(*cmdBuffer);
		vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline.getPipelineLayout(), 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
		vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostUniformWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

		// Restrict the dispatch to the devices selected by this mask.
		vk.cmdSetDeviceMask(*cmdBuffer, physDevMask);
		vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());

		vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &shaderWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

		endCommandBuffer(vk, *cmdBuffer);
		submitCommandsAndWait(vk, device, queue, *cmdBuffer, true, physDevMask);
		m_context.resetCommandPoolForVKSC(device, *cmdPool);

		// Validate the results on all physical devices where compute shader was launched
		const VkBufferMemoryBarrier srcBufferBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT , *sboBuffer, 0ull, bufferSizeBytes);
		const VkBufferMemoryBarrier dstBufferBarrier = makeBufferMemoryBarrier(VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *checkBuffer, 0ull, bufferSizeBytes);
		const VkBufferCopy	copyParams =
		{
			(VkDeviceSize)0u,						// srcOffset
			(VkDeviceSize)0u,						// dstOffset
			bufferSizeBytes							// size
		};

		for (deUint32 physDevIdx = 0; physDevIdx < m_numPhysDevices; physDevIdx++)
		{
			// Only devices that participated in this mask have fresh results.
			if (!(1<<physDevIdx & physDevMask))
				continue;

			const deUint32 deviceMask = 1 << physDevIdx;

			// Copy this device's SSBO instance into the host-visible staging buffer.
			beginCommandBuffer(vk, *cmdBuffer);
			vk.cmdSetDeviceMask(*cmdBuffer, deviceMask);
			vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT , VK_PIPELINE_STAGE_TRANSFER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &srcBufferBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
			vk.cmdCopyBuffer(*cmdBuffer, *sboBuffer, *checkBuffer, 1, &copyParams);
			vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &dstBufferBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

			endCommandBuffer(vk, *cmdBuffer);
			submitCommandsAndWait(vk, device, queue, *cmdBuffer, true, deviceMask);

			const Allocation& bufferAllocation = checkBuffer.getAllocation();
			invalidateAlloc(vk, device, bufferAllocation);
			const deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());

			for (deUint32 ndx = 0; ndx < m_numValues; ++ndx)
			{
				const deUint32 res = bufferPtr[ndx];
				// Index 4*(physDevIdx+1) matches the std140 stride of baseOffset[].
				const deUint32 ref = constantValPerLoop + uniformInputData[4 * (physDevIdx + 1)];

				if (res != ref)
				{
					std::ostringstream msg;
					msg << "Comparison failed on physical device "<< getPhysicalDevice(physDevIdx) <<" ( deviceMask "<< deviceMask <<" ) for InOut.values[" << ndx << "]";
					return tcu::TestStatus::fail(msg.str());
				}
			}
		}
	}

	return tcu::TestStatus::pass("Compute succeeded");
}
3196 
// Test case for running compute work concurrently; the shader simply bitwise-inverts
// a fixed-size (1024-element) storage buffer.
class ConcurrentCompute : public vkt::TestCase
{
public:
						ConcurrentCompute	(tcu::TestContext&	testCtx,
											 const std::string&	name,
											 const vk::ComputePipelineConstructionType computePipelineConstructionType);


	virtual void		checkSupport		(Context& context) const;
	void				initPrograms		(SourceCollections& sourceCollections) const;
	TestInstance*		createInstance		(Context&			context) const;

	// NOTE(review): public member, unlike the private members of the sibling test
	// classes — presumably intentional; confirm before tightening access.
	vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
3211 
// Instance side of ConcurrentCompute.
class ConcurrentComputeInstance : public vkt::TestInstance
{
public:
									ConcurrentComputeInstance	(Context& context, const vk::ComputePipelineConstructionType computePipelineConstructionType);

	tcu::TestStatus					iterate						(void);
private:
	vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
3221 
// Stores the requested pipeline construction type; no other setup is needed.
ConcurrentCompute::ConcurrentCompute (tcu::TestContext&	testCtx,
									  const std::string&	name,
									  const vk::ComputePipelineConstructionType computePipelineConstructionType)
	: TestCase							(testCtx, name)
	, m_computePipelineConstructionType (computePipelineConstructionType)
{
}
3229 
// Delegates to checkShaderObjectRequirements so the test is reported as not
// supported when the selected construction type (e.g. shader objects) is
// unavailable on this physical device.
void ConcurrentCompute::checkSupport (Context& context) const
{
	checkShaderObjectRequirements(context.getInstanceInterface(), context.getPhysicalDevice(), m_computePipelineConstructionType);
}
3234 
initPrograms(SourceCollections & sourceCollections) const3235 void ConcurrentCompute::initPrograms (SourceCollections& sourceCollections) const
3236 {
3237 	std::ostringstream src;
3238 	src << "#version 310 es\n"
3239 		<< "layout (local_size_x = 1, local_size_y = 1, local_size_z = 1) in;\n"
3240 		<< "layout(binding = 0) buffer InOut {\n"
3241 		<< "    uint values[1024];\n"
3242 		<< "} sb_inout;\n"
3243 		<< "void main (void) {\n"
3244 		<< "    uvec3 size           = gl_NumWorkGroups * gl_WorkGroupSize;\n"
3245 		<< "    uint numValuesPerInv = uint(sb_inout.values.length()) / (size.x*size.y*size.z);\n"
3246 		<< "    uint groupNdx        = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
3247 		<< "    uint offset          = numValuesPerInv*groupNdx;\n"
3248 		<< "\n"
3249 		<< "    for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
3250 		<< "        sb_inout.values[offset + ndx] = ~sb_inout.values[offset + ndx];\n"
3251 		<< "}\n";
3252 
3253 	sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
3254 }
3255 
// Factory: hands the construction type through to the per-run instance.
TestInstance* ConcurrentCompute::createInstance (Context& context) const
{
	return new ConcurrentComputeInstance(context, m_computePipelineConstructionType);
}
3260 
// Stores the pipeline construction type for use in iterate().
ConcurrentComputeInstance::ConcurrentComputeInstance (Context& context, const vk::ComputePipelineConstructionType computePipelineConstructionType)
	: TestInstance						(context)
	, m_computePipelineConstructionType	(computePipelineConstructionType)
{
}
3266 
iterate(void)3267 tcu::TestStatus ConcurrentComputeInstance::iterate (void)
3268 {
3269 	enum {
3270 		NO_MATCH_FOUND	= ~((deUint32)0),
3271 		ERROR_NONE		= 0,
3272 		ERROR_WAIT		= 1,
3273 		ERROR_ORDER		= 2
3274 	};
3275 
3276 	struct Queues
3277 	{
3278 		VkQueue		queue;
3279 		deUint32	queueFamilyIndex;
3280 	};
3281 
3282 //	const DeviceInterface&					vk							= m_context.getDeviceInterface();
3283 	const deUint32							numValues					= 1024;
3284 	const CustomInstance					instance					(createCustomInstanceFromContext(m_context));
3285 	const InstanceDriver&					instanceDriver				(instance.getDriver());
3286 	const VkPhysicalDevice					physicalDevice				= chooseDevice(instanceDriver, instance, m_context.getTestContext().getCommandLine());
3287 	tcu::TestLog&							log							= m_context.getTestContext().getLog();
3288 	vk::Move<vk::VkDevice>					logicalDevice;
3289 	std::vector<VkQueueFamilyProperties>	queueFamilyProperties;
3290 	VkDeviceCreateInfo						deviceInfo;
3291 	VkPhysicalDeviceFeatures2				deviceFeatures2				= initVulkanStructure();
3292 	VkPhysicalDeviceFeatures				deviceFeatures;
3293 	const float								queuePriorities[2]			= {1.0f, 0.0f};
3294 	VkDeviceQueueCreateInfo					queueInfos[2];
3295 	Queues									queues[2]					=
3296 																		{
3297 																			{DE_NULL, (deUint32)NO_MATCH_FOUND},
3298 																			{DE_NULL, (deUint32)NO_MATCH_FOUND}
3299 																		};
3300 
3301 	queueFamilyProperties = getPhysicalDeviceQueueFamilyProperties(instanceDriver, physicalDevice);
3302 
3303 	for (deUint32 queueNdx = 0; queueNdx < queueFamilyProperties.size(); ++queueNdx)
3304 	{
3305 		if (queueFamilyProperties[queueNdx].queueFlags & VK_QUEUE_COMPUTE_BIT)
3306 		{
3307 			if (NO_MATCH_FOUND == queues[0].queueFamilyIndex)
3308 				queues[0].queueFamilyIndex = queueNdx;
3309 
3310 			if (queues[0].queueFamilyIndex != queueNdx || queueFamilyProperties[queueNdx].queueCount > 1u)
3311 			{
3312 				queues[1].queueFamilyIndex = queueNdx;
3313 				break;
3314 			}
3315 		}
3316 	}
3317 
3318 	if (queues[0].queueFamilyIndex == NO_MATCH_FOUND || queues[1].queueFamilyIndex == NO_MATCH_FOUND)
3319 		TCU_THROW(NotSupportedError, "Queues couldn't be created");
3320 
3321 	for (int queueNdx = 0; queueNdx < 2; ++queueNdx)
3322 	{
3323 		VkDeviceQueueCreateInfo queueInfo;
3324 		deMemset(&queueInfo, 0, sizeof(queueInfo));
3325 
3326 		queueInfo.sType				= VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
3327 		queueInfo.pNext				= DE_NULL;
3328 		queueInfo.flags				= (VkDeviceQueueCreateFlags)0u;
3329 		queueInfo.queueFamilyIndex	= queues[queueNdx].queueFamilyIndex;
3330 		queueInfo.queueCount		= (queues[0].queueFamilyIndex == queues[1].queueFamilyIndex) ? 2 : 1;
3331 		queueInfo.pQueuePriorities	= (queueInfo.queueCount == 2) ? queuePriorities : &queuePriorities[queueNdx];
3332 
3333 		queueInfos[queueNdx]		= queueInfo;
3334 
3335 		if (queues[0].queueFamilyIndex == queues[1].queueFamilyIndex)
3336 			break;
3337 	}
3338 
3339 	void* pNext = DE_NULL;
3340 
3341 	deMemset(&deviceInfo, 0, sizeof(deviceInfo));
3342 	instanceDriver.getPhysicalDeviceFeatures(physicalDevice, &deviceFeatures);
3343 
3344 	deviceFeatures2.features = deviceFeatures;
3345 
3346 	std::vector<const char*> deviceExtensions;
3347 
3348 #ifndef CTS_USES_VULKANSC
3349 	VkPhysicalDeviceDynamicRenderingFeaturesKHR		dynamicRenderingFeatures = initVulkanStructure();
3350 	dynamicRenderingFeatures.dynamicRendering = VK_TRUE;
3351 	VkPhysicalDeviceShaderObjectFeaturesEXT			shaderObjectFeatures = initVulkanStructure(&dynamicRenderingFeatures);
3352 	shaderObjectFeatures.shaderObject = VK_TRUE;
3353 
3354 	if (m_computePipelineConstructionType != COMPUTE_PIPELINE_CONSTRUCTION_TYPE_PIPELINE)
3355 	{
3356 		deviceExtensions.push_back("VK_EXT_shader_object");
3357 		deviceFeatures2.pNext = &shaderObjectFeatures;
3358 		pNext = &deviceFeatures2;
3359 	}
3360 #endif
3361 
3362 #ifdef CTS_USES_VULKANSC
3363 	VkDeviceObjectReservationCreateInfo memReservationInfo	= m_context.getTestContext().getCommandLine().isSubProcess() ? m_context.getResourceInterface()->getStatMax() : resetDeviceObjectReservationCreateInfo();
3364 	memReservationInfo.pNext								= pNext;
3365 	pNext													= &memReservationInfo;
3366 
3367 	VkPhysicalDeviceVulkanSC10Features sc10Features			= createDefaultSC10Features();
3368 	sc10Features.pNext										= pNext;
3369 	pNext													= &sc10Features;
3370 
3371 	VkPipelineCacheCreateInfo			pcCI;
3372 	std::vector<VkPipelinePoolSize>		poolSizes;
3373 	if (m_context.getTestContext().getCommandLine().isSubProcess())
3374 	{
3375 		if (m_context.getResourceInterface()->getCacheDataSize() > 0)
3376 		{
3377 			pcCI =
3378 			{
3379 				VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO,			// VkStructureType				sType;
3380 				DE_NULL,												// const void*					pNext;
3381 				VK_PIPELINE_CACHE_CREATE_READ_ONLY_BIT |
3382 					VK_PIPELINE_CACHE_CREATE_USE_APPLICATION_STORAGE_BIT,	// VkPipelineCacheCreateFlags	flags;
3383 				m_context.getResourceInterface()->getCacheDataSize(),	// deUintptr					initialDataSize;
3384 				m_context.getResourceInterface()->getCacheData()		// const void*					pInitialData;
3385 			};
3386 			memReservationInfo.pipelineCacheCreateInfoCount		= 1;
3387 			memReservationInfo.pPipelineCacheCreateInfos		= &pcCI;
3388 		}
3389 
3390 		poolSizes							= m_context.getResourceInterface()->getPipelinePoolSizes();
3391 		if (!poolSizes.empty())
3392 		{
3393 			memReservationInfo.pipelinePoolSizeCount			= deUint32(poolSizes.size());
3394 			memReservationInfo.pPipelinePoolSizes				= poolSizes.data();
3395 		}
3396 	}
3397 #endif // CTS_USES_VULKANSC
3398 
3399 	deviceInfo.sType					= VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
3400 	deviceInfo.pNext					= pNext;
3401 	deviceInfo.enabledExtensionCount	= (deUint32)deviceExtensions.size();
3402 	deviceInfo.ppEnabledExtensionNames	= deviceExtensions.data();
3403 	deviceInfo.enabledLayerCount		= 0u;
3404 	deviceInfo.ppEnabledLayerNames		= DE_NULL;
3405 	deviceInfo.pEnabledFeatures			= (deviceFeatures2.pNext == DE_NULL) ? &deviceFeatures : DE_NULL;
3406 	deviceInfo.queueCreateInfoCount		= (queues[0].queueFamilyIndex == queues[1].queueFamilyIndex) ? 1 : 2;
3407 	deviceInfo.pQueueCreateInfos		= queueInfos;
3408 
3409 	logicalDevice = createCustomDevice	(m_context.getTestContext().getCommandLine().isValidationEnabled(), m_context.getPlatformInterface(), instance, instanceDriver, physicalDevice, &deviceInfo);
3410 
3411 #ifndef CTS_USES_VULKANSC
3412 	de::MovePtr<vk::DeviceDriver>	deviceDriver = de::MovePtr<DeviceDriver>(new DeviceDriver(m_context.getPlatformInterface(), instance, *logicalDevice, m_context.getUsedApiVersion()));
3413 #else
3414 	de::MovePtr<vk::DeviceDriverSC, vk::DeinitDeviceDeleter>	deviceDriver = de::MovePtr<DeviceDriverSC, DeinitDeviceDeleter>(new DeviceDriverSC(m_context.getPlatformInterface(), instance, *logicalDevice, m_context.getTestContext().getCommandLine(), m_context.getResourceInterface(), m_context.getDeviceVulkanSC10Properties(), m_context.getDeviceProperties(), m_context.getUsedApiVersion()), vk::DeinitDeviceDeleter(m_context.getResourceInterface().get(), *logicalDevice));
3415 #endif // CTS_USES_VULKANSC
3416 	vk::DeviceInterface& vk = *deviceDriver;
3417 
3418 	for (deUint32 queueReqNdx = 0; queueReqNdx < 2; ++queueReqNdx)
3419 	{
3420 		if (queues[0].queueFamilyIndex == queues[1].queueFamilyIndex)
3421 			vk.getDeviceQueue(*logicalDevice, queues[queueReqNdx].queueFamilyIndex, queueReqNdx, &queues[queueReqNdx].queue);
3422 		else
3423 			vk.getDeviceQueue(*logicalDevice, queues[queueReqNdx].queueFamilyIndex, 0u, &queues[queueReqNdx].queue);
3424 	}
3425 
3426 	// Create an input/output buffers
3427 	const VkPhysicalDeviceMemoryProperties memoryProperties	= vk::getPhysicalDeviceMemoryProperties(instanceDriver, physicalDevice);
3428 
3429 	de::MovePtr<SimpleAllocator> allocator					= de::MovePtr<SimpleAllocator>(new SimpleAllocator(vk, *logicalDevice, memoryProperties));
3430 	const VkDeviceSize bufferSizeBytes						= sizeof(deUint32) * numValues;
3431 	const BufferWithMemory buffer1(vk, *logicalDevice, *allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
3432 	const BufferWithMemory buffer2(vk, *logicalDevice, *allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
3433 
3434 	// Fill the buffers with data
3435 
3436 	typedef std::vector<deUint32> data_vector_t;
3437 	data_vector_t inputData(numValues);
3438 
3439 	{
3440 		de::Random rnd(0x82ce7f);
3441 		const Allocation& bufferAllocation1	= buffer1.getAllocation();
3442 		const Allocation& bufferAllocation2	= buffer2.getAllocation();
3443 		deUint32* bufferPtr1				= static_cast<deUint32*>(bufferAllocation1.getHostPtr());
3444 		deUint32* bufferPtr2				= static_cast<deUint32*>(bufferAllocation2.getHostPtr());
3445 
3446 		for (deUint32 i = 0; i < numValues; ++i)
3447 		{
3448 			deUint32 val = rnd.getUint32();
3449 			inputData[i] = val;
3450 			*bufferPtr1++ = val;
3451 			*bufferPtr2++ = val;
3452 		}
3453 
3454 		flushAlloc(vk, *logicalDevice, bufferAllocation1);
3455 		flushAlloc(vk, *logicalDevice, bufferAllocation2);
3456 	}
3457 
3458 	// Create descriptor sets
3459 
3460 	const Unique<VkDescriptorSetLayout>	descriptorSetLayout1(
3461 		DescriptorSetLayoutBuilder()
3462 		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
3463 		.build(vk, *logicalDevice));
3464 
3465 	const Unique<VkDescriptorPool>		descriptorPool1(
3466 		DescriptorPoolBuilder()
3467 		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
3468 		.build(vk, *logicalDevice, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
3469 
3470 	const Unique<VkDescriptorSet>		descriptorSet1(makeDescriptorSet(vk, *logicalDevice, *descriptorPool1, *descriptorSetLayout1));
3471 
3472 	const VkDescriptorBufferInfo		bufferDescriptorInfo1	= makeDescriptorBufferInfo(*buffer1, 0ull, bufferSizeBytes);
3473 		DescriptorSetUpdateBuilder()
3474 		.writeSingle(*descriptorSet1, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo1)
3475 		.update(vk, *logicalDevice);
3476 
3477 	const Unique<VkDescriptorSetLayout>	descriptorSetLayout2(
3478 		DescriptorSetLayoutBuilder()
3479 		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
3480 		.build(vk, *logicalDevice));
3481 
3482 	const Unique<VkDescriptorPool>		descriptorPool2(
3483 		DescriptorPoolBuilder()
3484 		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
3485 		.build(vk, *logicalDevice, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
3486 
3487 	const Unique<VkDescriptorSet>		descriptorSet2(makeDescriptorSet(vk, *logicalDevice, *descriptorPool2, *descriptorSetLayout2));
3488 
3489 	const VkDescriptorBufferInfo		bufferDescriptorInfo2	= makeDescriptorBufferInfo(*buffer2, 0ull, bufferSizeBytes);
3490 		DescriptorSetUpdateBuilder()
3491 		.writeSingle(*descriptorSet2, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo2)
3492 		.update(vk, *logicalDevice);
3493 
3494 	// Perform the computation
3495 
3496 	const Unique<VkShaderModule>		shaderModule(createShaderModule(vk, *logicalDevice, m_context.getBinaryCollection().get("comp"), 0u));
3497 
3498 	ComputePipelineWrapper				pipeline1(vk, *logicalDevice, m_computePipelineConstructionType, m_context.getBinaryCollection().get("comp"));
3499 	pipeline1.setDescriptorSetLayout(*descriptorSetLayout1);
3500 	pipeline1.buildPipeline();
3501 	const VkBufferMemoryBarrier			hostWriteBarrier1		= makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *buffer1, 0ull, bufferSizeBytes);
3502 	const VkBufferMemoryBarrier			shaderWriteBarrier1		= makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer1, 0ull, bufferSizeBytes);
3503 	const Unique<VkCommandPool>			cmdPool1(makeCommandPool(vk, *logicalDevice, queues[0].queueFamilyIndex));
3504 	const Unique<VkCommandBuffer>		cmdBuffer1(allocateCommandBuffer(vk, *logicalDevice, *cmdPool1, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
3505 
3506 	ComputePipelineWrapper				pipeline2(vk, *logicalDevice, m_computePipelineConstructionType, m_context.getBinaryCollection().get("comp"));
3507 	pipeline2.setDescriptorSetLayout(*descriptorSetLayout2);
3508 	pipeline2.buildPipeline();
3509 	const VkBufferMemoryBarrier			hostWriteBarrier2		= makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *buffer2, 0ull, bufferSizeBytes);
3510 	const VkBufferMemoryBarrier			shaderWriteBarrier2		= makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer2, 0ull, bufferSizeBytes);
3511 	const Unique<VkCommandPool>			cmdPool2(makeCommandPool(vk, *logicalDevice, queues[1].queueFamilyIndex));
3512 	const Unique<VkCommandBuffer>		cmdBuffer2(allocateCommandBuffer(vk, *logicalDevice, *cmdPool2, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
3513 
3514 	// Command buffer 1
3515 
3516 	beginCommandBuffer(vk, *cmdBuffer1);
3517 	pipeline1.bind(*cmdBuffer1);
3518 	vk.cmdBindDescriptorSets(*cmdBuffer1, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline1.getPipelineLayout(), 0u, 1u, &descriptorSet1.get(), 0u, DE_NULL);
3519 	vk.cmdPipelineBarrier(*cmdBuffer1, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostWriteBarrier1, 0, (const VkImageMemoryBarrier*)DE_NULL);
3520 	vk.cmdDispatch(*cmdBuffer1, 1, 1, 1);
3521 	vk.cmdPipelineBarrier(*cmdBuffer1, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &shaderWriteBarrier1, 0, (const VkImageMemoryBarrier*)DE_NULL);
3522 	endCommandBuffer(vk, *cmdBuffer1);
3523 
3524 	// Command buffer 2
3525 
3526 	beginCommandBuffer(vk, *cmdBuffer2);
3527 	pipeline2.bind(*cmdBuffer2);
3528 	vk.cmdBindDescriptorSets(*cmdBuffer2, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline2.getPipelineLayout(), 0u, 1u, &descriptorSet2.get(), 0u, DE_NULL);
3529 	vk.cmdPipelineBarrier(*cmdBuffer2, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostWriteBarrier2, 0, (const VkImageMemoryBarrier*)DE_NULL);
3530 	vk.cmdDispatch(*cmdBuffer2, 1, 1, 1);
3531 	vk.cmdPipelineBarrier(*cmdBuffer2, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &shaderWriteBarrier2, 0, (const VkImageMemoryBarrier*)DE_NULL);
3532 	endCommandBuffer(vk, *cmdBuffer2);
3533 
3534 	VkSubmitInfo	submitInfo1 =
3535 	{
3536 		VK_STRUCTURE_TYPE_SUBMIT_INFO,			// sType
3537 		DE_NULL,								// pNext
3538 		0u,										// waitSemaphoreCount
3539 		DE_NULL,								// pWaitSemaphores
3540 		(const VkPipelineStageFlags*)DE_NULL,	// pWaitDstStageMask
3541 		1u,										// commandBufferCount
3542 		&cmdBuffer1.get(),						// pCommandBuffers
3543 		0u,										// signalSemaphoreCount
3544 		DE_NULL									// pSignalSemaphores
3545 	};
3546 
3547 	VkSubmitInfo	submitInfo2 =
3548 	{
3549 		VK_STRUCTURE_TYPE_SUBMIT_INFO,			// sType
3550 		DE_NULL,								// pNext
3551 		0u,										// waitSemaphoreCount
3552 		DE_NULL,								// pWaitSemaphores
3553 		(const VkPipelineStageFlags*)DE_NULL,	// pWaitDstStageMask
3554 		1u,										// commandBufferCount
3555 		&cmdBuffer2.get(),						// pCommandBuffers
3556 		0u,										// signalSemaphoreCount
3557 		DE_NULL									// pSignalSemaphores
3558 	};
3559 
3560 	// Wait for completion
3561 	const Unique<VkFence>	fence1(createFence(vk, *logicalDevice));
3562 	const Unique<VkFence>	fence2(createFence(vk, *logicalDevice));
3563 
3564 	VK_CHECK(vk.queueSubmit(queues[0].queue, 1u, &submitInfo1, *fence1));
3565 	VK_CHECK(vk.queueSubmit(queues[1].queue, 1u, &submitInfo2, *fence2));
3566 
3567 	int err = ERROR_NONE;
3568 
3569 	// First wait for the low-priority queue
3570 	if (VK_SUCCESS != vk.waitForFences(*logicalDevice, 1u, &fence2.get(), DE_TRUE, ~0ull))
3571 		err = ERROR_WAIT;
3572 
3573 	// If the high-priority queue hasn't finished, we have a problem.
3574 	if (VK_SUCCESS != vk.getFenceStatus(*logicalDevice, fence1.get()))
3575 		if (err == ERROR_NONE)
3576 			err = ERROR_ORDER;
3577 
3578 	// Wait for the high-priority fence so we don't get errors on teardown.
3579 	vk.waitForFences(*logicalDevice, 1u, &fence1.get(), DE_TRUE, ~0ull);
3580 
3581 	// If we fail() before waiting for all of the fences, error will come from
3582 	// teardown instead of the error we want.
3583 
3584 	if (err == ERROR_WAIT)
3585 	{
3586 		return tcu::TestStatus::fail("Failed waiting for low-priority queue fence.");
3587 	}
3588 
3589 	// Validate the results
3590 
3591 	const Allocation& bufferAllocation1	= buffer1.getAllocation();
3592 	invalidateAlloc(vk, *logicalDevice, bufferAllocation1);
3593 	const deUint32* bufferPtr1			= static_cast<deUint32*>(bufferAllocation1.getHostPtr());
3594 
3595 	const Allocation& bufferAllocation2	= buffer2.getAllocation();
3596 	invalidateAlloc(vk, *logicalDevice, bufferAllocation2);
3597 	const deUint32* bufferPtr2			= static_cast<deUint32*>(bufferAllocation2.getHostPtr());
3598 
3599 	for (deUint32 ndx = 0; ndx < numValues; ++ndx)
3600 	{
3601 		const deUint32 res1	= bufferPtr1[ndx];
3602 		const deUint32 res2	= bufferPtr2[ndx];
3603 		const deUint32 inp	= inputData[ndx];
3604 		const deUint32 ref	= ~inp;
3605 
3606 		if (res1 != ref || res1 != res2)
3607 		{
3608 			std::ostringstream msg;
3609 			msg << "Comparison failed for InOut.values[" << ndx << "] ref:" << ref <<" res1:" << res1 << " res2:" << res2 << " inp:" << inp;
3610 			return tcu::TestStatus::fail(msg.str());
3611 		}
3612 	}
3613 
3614 	if (err == ERROR_ORDER)
3615 		log << tcu::TestLog::Message << "Note: Low-priority queue was faster than high-priority one. This is not an error, but priorities may be inverted." << tcu::TestLog::EndMessage;
3616 
3617 	return tcu::TestStatus::pass("Test passed");
3618 }
3619 
// Test case checking that a dispatch with at least one zero-sized dimension
// executes no invocations (verified by a counter in the instance's iterate()).
class EmptyWorkGroupCase : public vkt::TestCase
{
public:
					EmptyWorkGroupCase		(tcu::TestContext& testCtx, const std::string& name, const tcu::UVec3& dispatchSize, const vk::ComputePipelineConstructionType computePipelineConstructionType);
	virtual			~EmptyWorkGroupCase		(void) {}

	virtual void	checkSupport			(Context& context) const override;
	TestInstance*	createInstance			(Context& context) const override;
	void			initPrograms			(vk::SourceCollections& programCollection) const override;

protected:
	// Dispatch size with at least one component equal to zero (asserted in the ctor).
	const tcu::UVec3 m_dispatchSize;
	vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
3634 
// Instance for EmptyWorkGroupCase: dispatches the empty size followed by a
// 1x1x1 dispatch and checks the counter was incremented exactly once.
class EmptyWorkGroupInstance : public vkt::TestInstance
{
public:
						EmptyWorkGroupInstance	(Context& context, const tcu::UVec3& dispatchSize, const vk::ComputePipelineConstructionType computePipelineConstructionType)
							: vkt::TestInstance					(context)
							, m_dispatchSize					(dispatchSize)
							, m_computePipelineConstructionType	(computePipelineConstructionType)
							{}
	virtual				~EmptyWorkGroupInstance	(void) {}

	tcu::TestStatus		iterate					(void) override;

protected:
	const tcu::UVec3 m_dispatchSize;
	vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
3651 
// The test only makes sense for an "empty" dispatch, so at least one
// component of the dispatch size must be zero.
EmptyWorkGroupCase::EmptyWorkGroupCase (tcu::TestContext& testCtx, const std::string& name, const tcu::UVec3& dispatchSize, const vk::ComputePipelineConstructionType computePipelineConstructionType)
	: vkt::TestCase						(testCtx, name)
	, m_dispatchSize					(dispatchSize)
	, m_computePipelineConstructionType	(computePipelineConstructionType)
{
	DE_ASSERT(m_dispatchSize.x() == 0u || m_dispatchSize.y() == 0u || m_dispatchSize.z() == 0u);
}
3659 
// Verifies the requested pipeline construction type (e.g. shader objects)
// is available on this device.
void EmptyWorkGroupCase::checkSupport (Context& context) const
{
	checkShaderObjectRequirements(context.getInstanceInterface(), context.getPhysicalDevice(), m_computePipelineConstructionType);
}
3664 
// Factory: forwards the dispatch size and construction type to the instance.
TestInstance* EmptyWorkGroupCase::createInstance (Context& context) const
{
	return new EmptyWorkGroupInstance(context, m_dispatchSize, m_computePipelineConstructionType);
}
3669 
initPrograms(vk::SourceCollections & programCollection) const3670 void EmptyWorkGroupCase::initPrograms (vk::SourceCollections& programCollection) const
3671 {
3672 	std::ostringstream comp;
3673 	comp
3674 		<< "#version 450\n"
3675 		<< "layout (local_size_x=1, local_size_y=1, local_size_z=1) in;\n"
3676 		<< "layout (set=0, binding=0) buffer VerificationBlock { uint value; } verif;\n"
3677 		<< "void main () { atomicAdd(verif.value, 1u); }\n"
3678 		;
3679 	programCollection.glslSources.add("comp") << glu::ComputeSource(comp.str());
3680 }
3681 
// Records an empty dispatch (one dimension is zero) followed by a 1x1x1
// dispatch into a zero-initialized counter buffer. If the empty dispatch
// incorrectly launched invocations, the final counter would exceed 1.
tcu::TestStatus EmptyWorkGroupInstance::iterate (void)
{
	const auto&		vkd				= m_context.getDeviceInterface();
	const auto		device			= m_context.getDevice();
	auto&			alloc			= m_context.getDefaultAllocator();
	const auto		queueIndex		= m_context.getUniversalQueueFamilyIndex();
	const auto		queue			= m_context.getUniversalQueue();

	// Host-visible counter buffer, cleared to zero before submission.
	const auto			verifBufferSize		= static_cast<VkDeviceSize>(sizeof(uint32_t));
	const auto			verifBufferInfo		= makeBufferCreateInfo(verifBufferSize, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT);
	BufferWithMemory	verifBuffer			(vkd, device, alloc, verifBufferInfo, MemoryRequirement::HostVisible);
	auto&				verifBufferAlloc	= verifBuffer.getAllocation();
	void*				verifBufferPtr		= verifBufferAlloc.getHostPtr();

	deMemset(verifBufferPtr, 0, static_cast<size_t>(verifBufferSize));
	flushAlloc(vkd, device, verifBufferAlloc);

	// Single storage-buffer binding for the counter.
	DescriptorSetLayoutBuilder layoutBuilder;
	layoutBuilder.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT);
	const auto descriptorSetLayout = layoutBuilder.build(vkd, device);

	ComputePipelineWrapper			pipeline(vkd, device, m_computePipelineConstructionType, m_context.getBinaryCollection().get("comp"));
	pipeline.setDescriptorSetLayout(descriptorSetLayout.get());
	pipeline.buildPipeline();

	DescriptorPoolBuilder poolBuilder;
	poolBuilder.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
	const auto descriptorPool	= poolBuilder.build(vkd, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u);
	const auto descriptorSet	= makeDescriptorSet(vkd, device, descriptorPool.get(), descriptorSetLayout.get());

	DescriptorSetUpdateBuilder updateBuilder;
	const auto verifBufferDescInfo = makeDescriptorBufferInfo(verifBuffer.get(), 0ull, verifBufferSize);
	updateBuilder.writeSingle(descriptorSet.get(), DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &verifBufferDescInfo);
	updateBuilder.update(vkd, device);

	const auto cmdPool = makeCommandPool(vkd, device, queueIndex);
	const auto cmdBufferPtr = allocateCommandBuffer(vkd, device, cmdPool.get(), VK_COMMAND_BUFFER_LEVEL_PRIMARY);
	const auto cmdBuffer = cmdBufferPtr.get();

	beginCommandBuffer(vkd, cmdBuffer);
	pipeline.bind(cmdBuffer);
	vkd.cmdBindDescriptorSets(cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline.getPipelineLayout(), 0u, 1u, &descriptorSet.get(), 0u, nullptr);
	// The "empty" dispatch: at least one dimension is zero, so it must run
	// no invocations at all.
	vkd.cmdDispatch(cmdBuffer, m_dispatchSize.x(), m_dispatchSize.y(), m_dispatchSize.z());

	// Order the two dispatches' accesses to the counter buffer.
	const auto readWriteAccess	= (VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_SHADER_READ_BIT);
	const auto computeToCompute = makeMemoryBarrier(readWriteAccess, readWriteAccess);
	vkd.cmdPipelineBarrier(cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0U, 1u, &computeToCompute, 0u, nullptr, 0u, nullptr);

	// The reference dispatch: exactly one invocation, so the counter must end
	// at exactly 1.
	vkd.cmdDispatch(cmdBuffer, 1u, 1u, 1u);

	// Make the shader write visible to the host read below.
	const auto computeToHost = makeMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT);
	vkd.cmdPipelineBarrier(cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, 0u, 1u, &computeToHost, 0u, nullptr, 0u, nullptr);

	endCommandBuffer(vkd, cmdBuffer);
	submitCommandsAndWait(vkd, device, queue, cmdBuffer);

	uint32_t value;
	invalidateAlloc(vkd, device, verifBufferAlloc);
	deMemcpy(&value, verifBufferPtr, sizeof(value));

	if (value != 1u)
	{
		std::ostringstream msg;
		msg << "Unexpected value found in buffer: " << value << " while expecting 1";
		TCU_FAIL(msg.str());
	}

	return tcu::TestStatus::pass("Pass");
}
3751 
// Test case exercising the maximum workgroup size along one axis, as reported
// in VkPhysicalDeviceLimits::maxComputeWorkGroupSize.
class MaxWorkGroupSizeTest : public vkt::TestCase
{
public:
	enum class Axis	{ X = 0, Y = 1, Z = 2 };

	struct Params
	{
		// Which axis to maximize.
		Axis axis;
	};

							MaxWorkGroupSizeTest	(tcu::TestContext& testCtx, const std::string& name, const Params& params, const vk::ComputePipelineConstructionType computePipelineConstructionType);
	virtual					~MaxWorkGroupSizeTest	(void) {}

	virtual void			initPrograms			(vk::SourceCollections& programCollection) const;
	virtual TestInstance*	createInstance			(Context& context) const;
	virtual void			checkSupport			(Context& context) const;

	// Helper to transform the axis value to an index.
	static int				getIndex				(Axis axis);

	// Helper returning the number of invocations according to the test parameters.
	static deUint32			getInvocations			(const Params& params, const vk::InstanceInterface& vki, vk::VkPhysicalDevice physicalDevice, const vk::VkPhysicalDeviceProperties* devProperties = nullptr);

	// Helper returning the buffer size needed to this test.
	static deUint32			getSSBOSize				(deUint32 invocations);

private:
	Params m_params;
	vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
3783 
// Instance for MaxWorkGroupSizeTest: runs the spec-constant-sized dispatch
// and verifies the results in iterate().
class MaxWorkGroupSizeInstance : public vkt::TestInstance
{
public:
								MaxWorkGroupSizeInstance	(Context& context, const MaxWorkGroupSizeTest::Params& params, const vk::ComputePipelineConstructionType computePipelineConstructionType);
	virtual						~MaxWorkGroupSizeInstance	(void) {}

	virtual tcu::TestStatus		iterate			(void);

private:
	MaxWorkGroupSizeTest::Params		m_params;
	vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
3796 
getIndex(Axis axis)3797 int MaxWorkGroupSizeTest::getIndex (Axis axis)
3798 {
3799 	const int ret = static_cast<int>(axis);
3800 	DE_ASSERT(ret >= static_cast<int>(Axis::X) && ret <= static_cast<int>(Axis::Z));
3801 	return ret;
3802 }
3803 
getInvocations(const Params & params,const vk::InstanceInterface & vki,vk::VkPhysicalDevice physicalDevice,const vk::VkPhysicalDeviceProperties * devProperties)3804 deUint32 MaxWorkGroupSizeTest::getInvocations (const Params& params, const vk::InstanceInterface& vki, vk::VkPhysicalDevice physicalDevice, const vk::VkPhysicalDeviceProperties* devProperties)
3805 {
3806 	const auto axis = getIndex(params.axis);
3807 
3808 	if (devProperties)
3809 		return devProperties->limits.maxComputeWorkGroupSize[axis];
3810 	return vk::getPhysicalDeviceProperties(vki, physicalDevice).limits.maxComputeWorkGroupSize[axis];
3811 }
3812 
getSSBOSize(deUint32 invocations)3813 deUint32 MaxWorkGroupSizeTest::getSSBOSize (deUint32 invocations)
3814 {
3815 	return invocations * static_cast<deUint32>(sizeof(deUint32));
3816 }
3817 
// Stores the axis selection and pipeline construction type.
MaxWorkGroupSizeTest::MaxWorkGroupSizeTest (tcu::TestContext& testCtx, const std::string& name, const Params& params, const vk::ComputePipelineConstructionType computePipelineConstructionType)
	: vkt::TestCase						(testCtx, name)
	, m_params							(params)
	, m_computePipelineConstructionType	(computePipelineConstructionType)
{}
3823 
initPrograms(vk::SourceCollections & programCollection) const3824 void MaxWorkGroupSizeTest::initPrograms (vk::SourceCollections& programCollection) const
3825 {
3826 	std::ostringstream shader;
3827 
3828 	// The actual local sizes will be set using spec constants when running the test instance.
3829 	shader
3830 		<< "#version 450\n"
3831 		<< "\n"
3832 		<< "layout(constant_id=0) const int local_size_x_val = 1;\n"
3833 		<< "layout(constant_id=1) const int local_size_y_val = 1;\n"
3834 		<< "layout(constant_id=2) const int local_size_z_val = 1;\n"
3835 		<< "\n"
3836 		<< "layout(local_size_x_id=0, local_size_y_id=1, local_size_z_id=2) in;\n"
3837 		<< "\n"
3838 		<< "layout(set=0, binding=0) buffer StorageBuffer {\n"
3839 		<< "    uint values[];\n"
3840 		<< "} ssbo;\n"
3841 		<< "\n"
3842 		<< "void main() {\n"
3843 		<< "    ssbo.values[gl_LocalInvocationIndex] = 1u;\n"
3844 		<< "}\n"
3845 		;
3846 
3847 	programCollection.glslSources.add("comp") << glu::ComputeSource(shader.str());
3848 }
3849 
// Factory method: hands the stored parameters over to the per-run test instance.
TestInstance* MaxWorkGroupSizeTest::createInstance (Context& context) const
{
	return new MaxWorkGroupSizeInstance(context, m_params, m_computePipelineConstructionType);
}
3854 
checkSupport(Context & context) const3855 void MaxWorkGroupSizeTest::checkSupport (Context& context) const
3856 {
3857 	const auto&	vki				= context.getInstanceInterface();
3858 	const auto	physicalDevice	= context.getPhysicalDevice();
3859 
3860 	const auto	properties		= vk::getPhysicalDeviceProperties(vki, physicalDevice);
3861 	const auto	invocations		= getInvocations(m_params, vki, physicalDevice, &properties);
3862 
3863 	if (invocations > properties.limits.maxComputeWorkGroupInvocations)
3864 		TCU_FAIL("Reported workgroup size limit in the axis is greater than the global invocation limit");
3865 
3866 	if (properties.limits.maxStorageBufferRange / static_cast<deUint32>(sizeof(deUint32)) < invocations)
3867 		TCU_THROW(NotSupportedError, "Maximum supported storage buffer range too small");
3868 
3869 	checkShaderObjectRequirements(vki, physicalDevice, m_computePipelineConstructionType);
3870 }
3871 
// Instance constructor: keeps a copy of the test parameters and the pipeline
// construction type; all real work happens in iterate().
MaxWorkGroupSizeInstance::MaxWorkGroupSizeInstance (Context& context, const MaxWorkGroupSizeTest::Params& params, const vk::ComputePipelineConstructionType computePipelineConstructionType)
	: vkt::TestInstance					(context)
	, m_params							(params)
	, m_computePipelineConstructionType	(computePipelineConstructionType)
{}
3877 
// Dispatches a single workgroup whose local size is maximized (via spec
// constants) along the axis under test, then verifies that every invocation
// wrote 1u into its slot of the output SSBO.
tcu::TestStatus MaxWorkGroupSizeInstance::iterate (void)
{
	const auto&	vki				= m_context.getInstanceInterface();
	const auto&	vkd				= m_context.getDeviceInterface();
	const auto	physicalDevice	= m_context.getPhysicalDevice();
	const auto	device			= m_context.getDevice();
	auto&		alloc			= m_context.getDefaultAllocator();
	const auto	queueIndex		= m_context.getUniversalQueueFamilyIndex();
	const auto	queue			= m_context.getUniversalQueue();
	auto&		log				= m_context.getTestContext().getLog();

	// One invocation per unit of the per-axis workgroup size limit; the SSBO holds one uint per invocation.
	const auto	axis			= MaxWorkGroupSizeTest::getIndex(m_params.axis);
	const auto	invocations		= MaxWorkGroupSizeTest::getInvocations(m_params, vki, physicalDevice);
	const auto	ssboSize		= static_cast<vk::VkDeviceSize>(MaxWorkGroupSizeTest::getSSBOSize(invocations));

	log
		<< tcu::TestLog::Message
		<< "Running test with " << invocations << " invocations on axis " << axis << " using a storage buffer size of " << ssboSize << " bytes"
		<< tcu::TestLog::EndMessage
		;

	// Main SSBO buffer.
	const auto				ssboInfo	= vk::makeBufferCreateInfo(ssboSize, vk::VK_BUFFER_USAGE_STORAGE_BUFFER_BIT);
	vk::BufferWithMemory	ssbo		(vkd, device, alloc, ssboInfo, vk::MemoryRequirement::HostVisible);

	// Descriptor set layouts.
	vk::DescriptorSetLayoutBuilder layoutBuilder;
	layoutBuilder.addSingleBinding(vk::VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, vk::VK_SHADER_STAGE_COMPUTE_BIT);
	const auto descriptorSetLayout = layoutBuilder.build(vkd, device);

	// Specialization constants: set the number of invocations in the appropriate local size id.
	const auto	entrySize				= static_cast<deUintptr>(sizeof(deInt32));
	deInt32		specializationData[3]	= { 1, 1, 1 };
	specializationData[axis] = static_cast<deInt32>(invocations);

	// Map entries match constant_id 0/1/2 in the shader (local_size_x/y/z).
	const vk::VkSpecializationMapEntry specializationMaps[3] =
	{
		{
			0u,										//	deUint32	constantID;
			0u,										//	deUint32	offset;
			entrySize,								//	deUintptr	size;
		},
		{
			1u,										//	deUint32	constantID;
			static_cast<deUint32>(entrySize),		//	deUint32	offset;
			entrySize,								//	deUintptr	size;
		},
		{
			2u,										//	deUint32	constantID;
			static_cast<deUint32>(entrySize * 2u),	//	deUint32	offset;
			entrySize,								//	deUintptr	size;
		},
	};

	const vk::VkSpecializationInfo specializationInfo =
	{
		3u,													//	deUint32						mapEntryCount;
		specializationMaps,									//	const VkSpecializationMapEntry*	pMapEntries;
		static_cast<deUintptr>(sizeof(specializationData)),	//	deUintptr						dataSize;
		specializationData,									//	const void*						pData;
	};

	ComputePipelineWrapper			testPipeline	(vkd, device, m_computePipelineConstructionType, m_context.getBinaryCollection().get("comp"));
	testPipeline.setDescriptorSetLayout(descriptorSetLayout.get());
	testPipeline.setSpecializationInfo(specializationInfo);
	testPipeline.buildPipeline();

	// Create descriptor pool and set.
	vk::DescriptorPoolBuilder poolBuilder;
	poolBuilder.addType(vk::VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
	const auto descriptorPool	= poolBuilder.build(vkd, device, vk::VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u);
	const auto descriptorSet	= vk::makeDescriptorSet(vkd, device, descriptorPool.get(), descriptorSetLayout.get());

	// Update descriptor set.
	const vk::VkDescriptorBufferInfo ssboBufferInfo =
	{
		ssbo.get(),		//	VkBuffer		buffer;
		0u,				//	VkDeviceSize	offset;
		VK_WHOLE_SIZE,	//	VkDeviceSize	range;
	};

	vk::DescriptorSetUpdateBuilder updateBuilder;
	updateBuilder.writeSingle(descriptorSet.get(), vk::DescriptorSetUpdateBuilder::Location::binding(0u), vk::VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &ssboBufferInfo);
	updateBuilder.update(vkd, device);

	// Clear buffer so any invocation that fails to write is detected as a zero.
	auto& ssboAlloc	= ssbo.getAllocation();
	void* ssboPtr	= ssboAlloc.getHostPtr();
	deMemset(ssboPtr, 0, static_cast<size_t>(ssboSize));
	vk::flushAlloc(vkd, device, ssboAlloc);

	// Run pipelines.
	const auto cmdPool		= vk::makeCommandPool(vkd, device, queueIndex);
	const auto cmdBUfferPtr	= vk::allocateCommandBuffer(vkd, device, cmdPool.get(), vk::VK_COMMAND_BUFFER_LEVEL_PRIMARY);
	const auto cmdBuffer	= cmdBUfferPtr.get();

	vk::beginCommandBuffer(vkd, cmdBuffer);

	// Run the main test shader. Make the host-side clear visible to the shader first.
	const auto hostToComputeBarrier = vk::makeBufferMemoryBarrier(vk::VK_ACCESS_HOST_WRITE_BIT, vk::VK_ACCESS_SHADER_WRITE_BIT, ssbo.get(), 0ull, VK_WHOLE_SIZE);
	vkd.cmdPipelineBarrier(cmdBuffer, vk::VK_PIPELINE_STAGE_HOST_BIT, vk::VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0u, 0u, nullptr, 1u, &hostToComputeBarrier, 0u, nullptr);

	testPipeline.bind(cmdBuffer);
	vkd.cmdBindDescriptorSets(cmdBuffer, vk::VK_PIPELINE_BIND_POINT_COMPUTE, testPipeline.getPipelineLayout(), 0u, 1u, &descriptorSet.get(), 0u, nullptr);
	vkd.cmdDispatch(cmdBuffer, 1u, 1u, 1u);

	// Make the shader writes visible to the host readback below.
	const auto computeToHostBarrier = vk::makeBufferMemoryBarrier(vk::VK_ACCESS_SHADER_WRITE_BIT, vk::VK_ACCESS_HOST_READ_BIT, ssbo.get(), 0ull, VK_WHOLE_SIZE);
	vkd.cmdPipelineBarrier(cmdBuffer, vk::VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, vk::VK_PIPELINE_STAGE_HOST_BIT, 0u, 0u, nullptr, 1u, &computeToHostBarrier, 0u, nullptr);

	vk::endCommandBuffer(vkd, cmdBuffer);
	vk::submitCommandsAndWait(vkd, device, queue, cmdBuffer);

	// Verify buffer contents: every invocation must have written 1u into its slot.
	vk::invalidateAlloc(vkd, device, ssboAlloc);
	std::unique_ptr<deUint32[]>	valuesArray	(new deUint32[invocations]);
	deUint32*					valuesPtr	= valuesArray.get();
	deMemcpy(valuesPtr, ssboPtr, static_cast<size_t>(ssboSize));

	std::string	errorMsg;
	bool		ok			= true;

	for (size_t i = 0; i < invocations; ++i)
	{
		if (valuesPtr[i] != 1u)
		{
			ok			= false;
			errorMsg	= "Found invalid value for invocation index " + de::toString(i) + ": expected 1u and found " + de::toString(valuesPtr[i]);
			break;
		}
	}

	if (!ok)
		return tcu::TestStatus::fail(errorMsg);
	return tcu::TestStatus::pass("Pass");
}
4013 
4014 namespace EmptyShaderTest
4015 {
4016 
checkSupport(Context & context,vk::ComputePipelineConstructionType computePipelineConstructionType)4017 void checkSupport (Context& context, vk::ComputePipelineConstructionType computePipelineConstructionType)
4018 {
4019 	checkShaderObjectRequirements(context.getInstanceInterface(), context.getPhysicalDevice(), computePipelineConstructionType);
4020 }
4021 
createProgram(SourceCollections & dst,vk::ComputePipelineConstructionType)4022 void createProgram (SourceCollections& dst, vk::ComputePipelineConstructionType)
4023 {
4024 	dst.glslSources.add("comp") << glu::ComputeSource(
4025 		"#version 310 es\n"
4026 		"layout (local_size_x = 1) in;\n"
4027 		"void main (void) {}\n"
4028 	);
4029 }
4030 
createTest(Context & context,vk::ComputePipelineConstructionType computePipelineConstructionType)4031 tcu::TestStatus createTest (Context& context, vk::ComputePipelineConstructionType computePipelineConstructionType)
4032 {
4033 	const DeviceInterface&	vk					= context.getDeviceInterface();
4034 	const VkDevice			device				= context.getDevice();
4035 	const VkQueue			queue				= context.getUniversalQueue();
4036 	const deUint32			queueFamilyIndex	= context.getUniversalQueueFamilyIndex();
4037 
4038 	ComputePipelineWrapper			pipeline		(vk, device, computePipelineConstructionType, context.getBinaryCollection().get("comp"));
4039 	pipeline.buildPipeline();
4040 
4041 	const Unique<VkCommandPool>		cmdPool			(makeCommandPool(vk, device, queueFamilyIndex));
4042 	const Unique<VkCommandBuffer>	cmdBuffer		(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
4043 
4044 	// Start recording commands
4045 
4046 	beginCommandBuffer(vk, *cmdBuffer);
4047 
4048 	pipeline.bind(*cmdBuffer);
4049 
4050 	const tcu::IVec3 workGroups(1, 1, 1);
4051 	vk.cmdDispatch(*cmdBuffer, workGroups.x(), workGroups.y(), workGroups.z());
4052 
4053 	endCommandBuffer(vk, *cmdBuffer);
4054 
4055 	submitCommandsAndWait(vk, device, queue, *cmdBuffer);
4056 
4057 	return tcu::TestStatus::pass("Compute succeeded");
4058 }
4059 
4060 } // EmptyShaderTest ns
4061 
4062 namespace ComputeOnlyQueueTests
4063 {
4064 
getComputeOnlyQueueFamily(Context & context)4065 tcu::Maybe<uint32_t> getComputeOnlyQueueFamily(Context& context)
4066 {
4067 	bool foundQueue = false;
4068 	uint32_t index = 0;
4069 
4070 	auto queueFamilies = getPhysicalDeviceQueueFamilyProperties(context.getInstanceInterface(), context.getPhysicalDevice());
4071 
4072 	for (const auto &queueFamily: queueFamilies)
4073 	{
4074 		if ((queueFamily.queueFlags & VK_QUEUE_COMPUTE_BIT) &&
4075 			!(queueFamily.queueFlags & VK_QUEUE_GRAPHICS_BIT))
4076 		{
4077 			foundQueue = true;
4078 			break;
4079 		} else {
4080 			index++;
4081 		}
4082 	}
4083 	if (!foundQueue)
4084 	{
4085 		return tcu::Maybe<uint32_t>();
4086 	} else {
4087 		return index;
4088 	}
4089 }
4090 
4091 // Creates a device that has a queue for compute capabilities without graphics.
createComputeOnlyDevice(Context & context,uint32_t & queueFamilyIndex)4092 Move<VkDevice> createComputeOnlyDevice(Context& context, uint32_t& queueFamilyIndex)
4093 {
4094 	const auto&	instanceDriver		= context.getInstanceInterface();
4095 	const auto	physicalDevice		= context.getPhysicalDevice();
4096 	const auto	queueFamilies		= getPhysicalDeviceQueueFamilyProperties(instanceDriver, physicalDevice);
4097 
4098 	// One queue family without a graphics bit should be found, since this is checked in checkSupport.
4099 	queueFamilyIndex = getComputeOnlyQueueFamily(context).get();
4100 
4101 	const float									queuePriority				= 1.0f;
4102 	const VkDeviceQueueCreateInfo				deviceQueueCreateInfos		= {
4103 		VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,	// VkStructureType				sType;
4104 		nullptr,									// const void*					pNext;
4105 		(VkDeviceQueueCreateFlags)0u,				// VkDeviceQueueCreateFlags		flags;
4106 		queueFamilyIndex,							// uint32_t						queueFamilyIndex;
4107 		1u,											// uint32_t						queueCount;
4108 		&queuePriority,								// const float*					pQueuePriorities;
4109 	};
4110 
4111 	void* pNext = nullptr;
4112 #ifdef CTS_USES_VULKANSC
4113 	VkDeviceObjectReservationCreateInfo memReservationInfo =
4114 		context.getTestContext().getCommandLine().isSubProcess() ? context.getResourceInterface()->getStatMax() : resetDeviceObjectReservationCreateInfo();
4115 	pNext = &memReservationInfo;
4116 
4117 	VkPipelineCacheCreateInfo			pcCI;
4118 	std::vector<VkPipelinePoolSize>		poolSizes;
4119 	if (context.getTestContext().getCommandLine().isSubProcess())
4120 	{
4121 		if (context.getResourceInterface()->getCacheDataSize() > 0)
4122 		{
4123 			pcCI =
4124 			{
4125 				VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO,				// VkStructureType				sType;
4126 				nullptr,													// const void*					pNext;
4127 				VK_PIPELINE_CACHE_CREATE_READ_ONLY_BIT |
4128 					VK_PIPELINE_CACHE_CREATE_USE_APPLICATION_STORAGE_BIT,	// VkPipelineCacheCreateFlags	flags;
4129 				context.getResourceInterface()->getCacheDataSize(),			// deUintptr					initialDataSize;
4130 				context.getResourceInterface()->getCacheData()				// const void*					pInitialData;
4131 			};
4132 			memReservationInfo.pipelineCacheCreateInfoCount		= 1;
4133 			memReservationInfo.pPipelineCacheCreateInfos		= &pcCI;
4134 		}
4135 		poolSizes = context.getResourceInterface()->getPipelinePoolSizes();
4136 		if (!poolSizes.empty())
4137 		{
4138 			memReservationInfo.pipelinePoolSizeCount		= deUint32(poolSizes.size());
4139 			memReservationInfo.pPipelinePoolSizes			= poolSizes.data();
4140 		}
4141 	}
4142 #endif // CTS_USES_VULKANSC
4143 	const VkDeviceCreateInfo deviceCreateInfo =
4144 	{
4145 		VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,			// VkStructureType					sType;
4146 		pNext,											// const void*						pNext;
4147 		(VkDeviceCreateFlags)0u,						// VkDeviceCreateFlags				flags;
4148 		1,												// uint32_t							queueCreateInfoCount;
4149 		&deviceQueueCreateInfos,						// const VkDeviceQueueCreateInfo*	pQueueCreateInfos;
4150 		0u,												// uint32_t							enabledLayerCount;
4151 		nullptr,										// const char* const*				ppEnabledLayerNames;
4152 		0,												// uint32_t							enabledExtensionCount;
4153 		nullptr,										// const char* const*				ppEnabledExtensionNames;
4154 		nullptr,										// const VkPhysicalDeviceFeatures*	pEnabledFeatures;
4155 	};
4156 
4157 	return vkt::createCustomDevice(context.getTestContext().getCommandLine().isValidationEnabled(),
4158 								   context.getPlatformInterface(),
4159 								   context.getInstance(),
4160 								   instanceDriver, physicalDevice, &deviceCreateInfo);
4161 }
4162 
// Test case checking that secondary command buffers can be recorded and
// executed on a compute-only (no graphics) queue.
class SecondaryCommandBufferComputeOnlyTest : public vkt::TestCase {
public:
	SecondaryCommandBufferComputeOnlyTest(tcu::TestContext& context, const std::string& name)
		: vkt::TestCase(context, name)
	{};

	void            initPrograms            (SourceCollections& programCollection) const override;
	TestInstance*   createInstance          (Context& context) const override;
	void            checkSupport            (Context& context) const override;
};
4173 
// Per-run instance for SecondaryCommandBufferComputeOnlyTest; all work is in iterate().
class SecondaryCommandBufferComputeOnlyTestInstance : public vkt::TestInstance {
public:
	SecondaryCommandBufferComputeOnlyTestInstance(Context& context) : vkt::TestInstance(context)
	{ };
	virtual tcu::TestStatus iterate(void);
};
4180 
initPrograms(SourceCollections & collection) const4181 void SecondaryCommandBufferComputeOnlyTest::initPrograms(SourceCollections& collection) const {
4182 		{
4183 		std::ostringstream src;
4184 		src << glu::getGLSLVersionDeclaration(glu::GLSL_VERSION_450) << "\n"
4185 			<< "layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;\n"
4186 			<< "layout(set = 0, binding = 0, std430) buffer Out\n"
4187 			<< "{\n"
4188 			<< "	uint data[];\n"
4189 			<< "};\n"
4190 			<< "void main (void)\n"
4191 			<< "{\n"
4192 			<< "data[0] = 1;"
4193 			<< "}\n";
4194 		collection.glslSources.add("comp") << glu::ComputeSource(src.str());
4195 	}
4196 }
4197 
4198 
// Factory method: creates the per-run instance for this test case.
TestInstance* SecondaryCommandBufferComputeOnlyTest::createInstance(Context& context) const {
	return new SecondaryCommandBufferComputeOnlyTestInstance(context);
}
4202 
checkSupport(Context & context) const4203 void SecondaryCommandBufferComputeOnlyTest::checkSupport(Context& context) const {
4204 	// Find at least one queue family that supports compute queue but does NOT support graphics queue.
4205 	if (!getComputeOnlyQueueFamily(context))
4206 		TCU_THROW(NotSupportedError, "No queue family found that only supports compute queue.");
4207 }
4208 
iterate()4209 tcu::TestStatus SecondaryCommandBufferComputeOnlyTestInstance::iterate()
4210 {
4211 	const InstanceInterface&	vki						= m_context.getInstanceInterface();
4212 #ifdef CTS_USES_VULKANSC
4213 	de::MovePtr<DeviceDriverSC, DeinitDeviceDeleter> deviceDriver;
4214 #else
4215 	de::MovePtr<DeviceDriver> deviceDriver;
4216 #endif // CTS_USES_VULKANSC
4217 	VkDevice device;
4218 	uint32_t queueFamilyIndex;
4219 	auto customDevice = createComputeOnlyDevice(m_context, queueFamilyIndex);
4220 	device = customDevice.get();
4221 #ifndef CTS_USES_VULKANSC
4222 	deviceDriver = de::MovePtr<DeviceDriver>(new DeviceDriver(m_context.getPlatformInterface(), m_context.getInstance(), device, m_context.getUsedApiVersion()));
4223 #else
4224 	deviceDriver = de::MovePtr<DeviceDriverSC, DeinitDeviceDeleter>(new DeviceDriverSC(m_context.getPlatformInterface(), m_context.getInstance(), device,
4225 		m_context.getTestContext().getCommandLine(), m_context.getResourceInterface(), m_context.getDeviceVulkanSC10Properties(),
4226 		m_context.getDeviceProperties(), m_context.getUsedApiVersion()), DeinitDeviceDeleter(m_context.getResourceInterface().get(), device));
4227 #endif // CTS_USES_VULKANSC
4228 
4229 	const DeviceInterface& vkdi = *deviceDriver;
4230 
4231 	auto queue		= getDeviceQueue(vkdi, device, queueFamilyIndex, 0u);
4232 	auto allocator	= de::MovePtr<Allocator>(new SimpleAllocator(vkdi, device, getPhysicalDeviceMemoryProperties(vki, m_context.getPhysicalDevice())));
4233 
4234 	const auto			bufferSize	= static_cast<VkDeviceSize>(sizeof(uint32_t));
4235 	BufferWithMemory	buffer			(vkdi, device, *allocator.get(), makeBufferCreateInfo(bufferSize, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
4236 	auto&				bufferAlloc		= buffer.getAllocation();
4237 	void*				bufferData		= bufferAlloc.getHostPtr();
4238 	deMemset(bufferData, 0, sizeof(uint32_t));
4239 	flushAlloc(vkdi, device, bufferAlloc);
4240 
4241 	DescriptorSetLayoutBuilder layoutBuilder;
4242 	layoutBuilder.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT);
4243 	Unique<VkDescriptorSetLayout> descriptorSetLayout(layoutBuilder.build(vkdi, device));
4244 
4245 	DescriptorPoolBuilder poolBuilder;
4246 	poolBuilder.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
4247 	const auto descriptorPool		= poolBuilder.build(vkdi, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1);
4248 	const auto descriptorSetBuffer	= makeDescriptorSet(vkdi, device, descriptorPool.get(), descriptorSetLayout.get());
4249 
4250 	// Update descriptor sets.
4251 	DescriptorSetUpdateBuilder updater;
4252 
4253 	const auto bufferInfo = makeDescriptorBufferInfo(buffer.get(), 0ull, bufferSize);
4254 	updater.writeSingle(descriptorSetBuffer.get(), DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferInfo);
4255 
4256 	updater.update(vkdi, device);
4257 
4258 	auto shader = createShaderModule(vkdi, device, m_context.getBinaryCollection().get("comp"));
4259 	// Create compute pipeline
4260 	const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vkdi, device, *descriptorSetLayout));
4261 	const Unique<VkPipeline> computePipeline(makeComputePipeline(vkdi, device, *pipelineLayout, *shader));
4262 
4263 	// Create command buffer
4264 	const Unique<VkCommandPool> cmdPool(makeCommandPool(vkdi, device, queueFamilyIndex));
4265 	const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vkdi, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
4266 	const Unique<VkCommandBuffer> cmdBuffer2(allocateCommandBuffer(vkdi, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_SECONDARY));
4267 
4268 	const VkCommandBufferInheritanceInfo bufferInheritanceInfo
4269 	{
4270 		VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_INFO,					// VkStructureType					sType;
4271 		nullptr,															// const void*						pNext;
4272 		VK_NULL_HANDLE,														// VkRenderPass						renderPass;
4273 		0u,																	// deUint32							subpass;
4274 		VK_NULL_HANDLE,														// VkFramebuffer					framebuffer;
4275 		VK_FALSE,															// VkBool32							occlusionQueryEnable;
4276 		(VkQueryControlFlags)0u,											// VkQueryControlFlags				queryFlags;
4277 		(VkQueryPipelineStatisticFlags)0u									// VkQueryPipelineStatisticFlags	pipelineStatistics;
4278 	};
4279 
4280 	VkCommandBufferUsageFlags usageFlags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
4281 	const VkCommandBufferBeginInfo commandBufBeginParams
4282 	{
4283 		VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,			// VkStructureType					sType;
4284 		nullptr,												// const void*						pNext;
4285 		usageFlags,												// VkCommandBufferUsageFlags		flags;
4286 		&bufferInheritanceInfo
4287 	};
4288 
4289 	beginCommandBuffer(vkdi, cmdBuffer.get());
4290 	vkdi.beginCommandBuffer(cmdBuffer2.get(), &commandBufBeginParams);
4291 	vkdi.cmdBindPipeline(cmdBuffer2.get(), VK_PIPELINE_BIND_POINT_COMPUTE, computePipeline.get());
4292 	vkdi.cmdBindDescriptorSets(cmdBuffer2.get(), VK_PIPELINE_BIND_POINT_COMPUTE, pipelineLayout.get(), 0u, 1, &descriptorSetBuffer.get(), 0u, nullptr);
4293 	vkdi.cmdDispatch(cmdBuffer2.get(), 1, 1, 1);
4294 	endCommandBuffer(vkdi, cmdBuffer2.get());
4295 	vkdi.cmdExecuteCommands(cmdBuffer.get(), 1, &cmdBuffer2.get());
4296 	const VkBufferMemoryBarrier renderBufferBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, buffer.get(), 0ull, bufferSize);
4297 	cmdPipelineBufferMemoryBarrier(vkdi, cmdBuffer.get(), VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, &renderBufferBarrier);
4298 	endCommandBuffer(vkdi, cmdBuffer.get());
4299 	submitCommandsAndWait(vkdi, device, queue, cmdBuffer.get());
4300 
4301 	invalidateAlloc(vkdi, device, bufferAlloc);
4302 
4303 	uint32_t result = 0;
4304 	deMemcpy(&result, bufferData, sizeof(uint32_t));
4305 	if (result != 1)
4306 	{
4307 		return tcu::TestStatus::pass("value of buffer unexpected");
4308 	}
4309 
4310 	return tcu::TestStatus::pass("passed");
4311 }
4312 
4313 };
4314 
4315 } // anonymous
4316 
// Helper that wraps free functions (support check, program init, test body)
// into a TestCase via InstanceFactory1WithSupport, for functions that take no
// extra argument (FunctionInstance0).
inline TestCase* createFunctionCaseWithPrograms2 (tcu::TestContext&				testCtx,
												 tcu::TestNodeType				type,
												 const std::string&				name,
												 FunctionSupport0::Function		checkSupport,
												 FunctionPrograms0::Function	initPrograms,
												 FunctionInstance0::Function	testFunction)
{
	return new InstanceFactory1WithSupport<FunctionInstance0, FunctionInstance0::Function, FunctionSupport0, FunctionPrograms0>(
		testCtx, type, name,FunctionPrograms0(initPrograms), testFunction, checkSupport);
}
createBasicComputeShaderTests(tcu::TestContext & testCtx,vk::ComputePipelineConstructionType computePipelineConstructionType)4327 tcu::TestCaseGroup* createBasicComputeShaderTests (tcu::TestContext& testCtx, vk::ComputePipelineConstructionType computePipelineConstructionType)
4328 {
4329 	// Basic compute tests
4330 	de::MovePtr<tcu::TestCaseGroup> basicComputeTests(new tcu::TestCaseGroup(testCtx, "basic"));
4331 
4332 	// Shader that does nothing
4333 	addFunctionCaseWithPrograms(basicComputeTests.get(), "empty_shader", EmptyShaderTest::checkSupport, EmptyShaderTest::createProgram, EmptyShaderTest::createTest, computePipelineConstructionType);
4334 
4335 	// Concurrent compute test
4336 	basicComputeTests->addChild(new ConcurrentCompute(testCtx, "concurrent_compute", computePipelineConstructionType));
4337 
4338 	// Use an empty workgroup with size 0 on the X axis
4339 	basicComputeTests->addChild(new EmptyWorkGroupCase(testCtx, "empty_workgroup_x", tcu::UVec3(0u, 2u, 3u), computePipelineConstructionType));
4340 	// Use an empty workgroup with size 0 on the Y axis
4341 	basicComputeTests->addChild(new EmptyWorkGroupCase(testCtx, "empty_workgroup_y", tcu::UVec3(2u, 0u, 3u), computePipelineConstructionType));
4342 	// Use an empty workgroup with size 0 on the Z axis
4343 	basicComputeTests->addChild(new EmptyWorkGroupCase(testCtx, "empty_workgroup_z", tcu::UVec3(2u, 3u, 0u), computePipelineConstructionType));
4344 	// Use an empty workgroup with size 0 on the X, Y and Z axes
4345 	basicComputeTests->addChild(new EmptyWorkGroupCase(testCtx, "empty_workgroup_all", tcu::UVec3(0u, 0u, 0u), computePipelineConstructionType));
4346 
4347 	// Use the maximum work group size on the X axis
4348 	basicComputeTests->addChild(new MaxWorkGroupSizeTest(testCtx, "max_local_size_x", MaxWorkGroupSizeTest::Params{MaxWorkGroupSizeTest::Axis::X}, computePipelineConstructionType));
4349 	// Use the maximum work group size on the Y axis
4350 	basicComputeTests->addChild(new MaxWorkGroupSizeTest(testCtx, "max_local_size_y", MaxWorkGroupSizeTest::Params{MaxWorkGroupSizeTest::Axis::Y}, computePipelineConstructionType));
4351 	// Use the maximum work group size on the Z axis
4352 	basicComputeTests->addChild(new MaxWorkGroupSizeTest(testCtx, "max_local_size_z", MaxWorkGroupSizeTest::Params{MaxWorkGroupSizeTest::Axis::Z}, computePipelineConstructionType));
4353 
4354 	// Concurrent compute test
4355 	basicComputeTests->addChild(BufferToBufferInvertTest::UBOToSSBOInvertCase(testCtx,	"ubo_to_ssbo_single_invocation",	256,	tcu::IVec3(1,1,1),	tcu::IVec3(1,1,1), computePipelineConstructionType));
4356 	basicComputeTests->addChild(BufferToBufferInvertTest::UBOToSSBOInvertCase(testCtx,	"ubo_to_ssbo_single_group",	1024,	tcu::IVec3(2,1,4),	tcu::IVec3(1,1,1), computePipelineConstructionType));
4357 	basicComputeTests->addChild(BufferToBufferInvertTest::UBOToSSBOInvertCase(testCtx,	"ubo_to_ssbo_multiple_invocations",	1024,	tcu::IVec3(1,1,1),	tcu::IVec3(2,4,1), computePipelineConstructionType));
4358 	basicComputeTests->addChild(BufferToBufferInvertTest::UBOToSSBOInvertCase(testCtx,	"ubo_to_ssbo_multiple_groups",	1024,	tcu::IVec3(1,4,2),	tcu::IVec3(2,2,4), computePipelineConstructionType));
4359 
4360 	// Concurrent compute test
4361 	basicComputeTests->addChild(BufferToBufferInvertTest::CopyInvertSSBOCase(testCtx,	"copy_ssbo_single_invocation",	256,	tcu::IVec3(1,1,1),	tcu::IVec3(1,1,1), computePipelineConstructionType));
4362 	basicComputeTests->addChild(BufferToBufferInvertTest::CopyInvertSSBOCase(testCtx,	"copy_ssbo_multiple_invocations",	1024,	tcu::IVec3(1,1,1),	tcu::IVec3(2,4,1), computePipelineConstructionType));
4363 	basicComputeTests->addChild(BufferToBufferInvertTest::CopyInvertSSBOCase(testCtx,	"copy_ssbo_multiple_groups",	1024,	tcu::IVec3(1,4,2),	tcu::IVec3(2,2,4), computePipelineConstructionType));
4364 
4365 	// Read and write same SSBO
4366 	basicComputeTests->addChild(new InvertSSBOInPlaceTest(testCtx,	"ssbo_rw_single_invocation", 256,	true,	tcu::IVec3(1,1,1),	tcu::IVec3(1,1,1), computePipelineConstructionType));
4367 	basicComputeTests->addChild(new InvertSSBOInPlaceTest(testCtx,	"ssbo_rw_multiple_groups",		1024,	true,	tcu::IVec3(1,4,2),	tcu::IVec3(2,2,4), computePipelineConstructionType));
4368 	basicComputeTests->addChild(new InvertSSBOInPlaceTest(testCtx,	"ssbo_unsized_arr_single_invocation",		256,	false,	tcu::IVec3(1,1,1),	tcu::IVec3(1,1,1), computePipelineConstructionType));
4369 	basicComputeTests->addChild(new InvertSSBOInPlaceTest(testCtx,	"ssbo_unsized_arr_multiple_groups",		1024,	false,	tcu::IVec3(1,4,2),	tcu::IVec3(2,2,4), computePipelineConstructionType));
4370 
4371 	// Write to multiple SSBOs
4372 	basicComputeTests->addChild(new WriteToMultipleSSBOTest(testCtx,	"write_multiple_arr_single_invocation", 256,	true,	tcu::IVec3(1,1,1),	tcu::IVec3(1,1,1), computePipelineConstructionType));
4373 	basicComputeTests->addChild(new WriteToMultipleSSBOTest(testCtx,	"write_multiple_arr_multiple_groups",	1024,	true,	tcu::IVec3(1,4,2),	tcu::IVec3(2,2,4), computePipelineConstructionType));
4374 	basicComputeTests->addChild(new WriteToMultipleSSBOTest(testCtx,	"write_multiple_unsized_arr_single_invocation",	256,	false,	tcu::IVec3(1,1,1),	tcu::IVec3(1,1,1), computePipelineConstructionType));
4375 	basicComputeTests->addChild(new WriteToMultipleSSBOTest(testCtx,	"write_multiple_unsized_arr_multiple_groups",	1024,	false,	tcu::IVec3(1,4,2),	tcu::IVec3(2,2,4), computePipelineConstructionType));
4376 
4377 	// SSBO local barrier usage
4378 	basicComputeTests->addChild(new SSBOLocalBarrierTest(testCtx,	"ssbo_local_barrier_single_invocation", tcu::IVec3(1,1,1),	tcu::IVec3(1,1,1), computePipelineConstructionType));
4379 	basicComputeTests->addChild(new SSBOLocalBarrierTest(testCtx,	"ssbo_local_barrier_single_group",	tcu::IVec3(3,2,5),	tcu::IVec3(1,1,1), computePipelineConstructionType));
4380 	basicComputeTests->addChild(new SSBOLocalBarrierTest(testCtx,	"ssbo_local_barrier_multiple_groups",	tcu::IVec3(3,4,1),	tcu::IVec3(2,7,3), computePipelineConstructionType));
4381 
4382 	// SSBO memory barrier usage
4383 	basicComputeTests->addChild(new SSBOBarrierTest(testCtx,	"ssbo_cmd_barrier_single", tcu::IVec3(1,1,1),	computePipelineConstructionType));
4384 	basicComputeTests->addChild(new SSBOBarrierTest(testCtx,	"ssbo_cmd_barrier_multiple",	tcu::IVec3(11,5,7), computePipelineConstructionType));
4385 
4386 	// Basic shared variable usage
4387 	basicComputeTests->addChild(new SharedVarTest(testCtx,	"shared_var_single_invocation", tcu::IVec3(1,1,1),	tcu::IVec3(1,1,1), computePipelineConstructionType));
4388 	basicComputeTests->addChild(new SharedVarTest(testCtx,	"shared_var_single_group",	tcu::IVec3(3,2,5),	tcu::IVec3(1,1,1), computePipelineConstructionType));
4389 	basicComputeTests->addChild(new SharedVarTest(testCtx,	"shared_var_multiple_invocations",	tcu::IVec3(1,1,1),	tcu::IVec3(2,5,4), computePipelineConstructionType));
4390 	basicComputeTests->addChild(new SharedVarTest(testCtx,	"shared_var_multiple_groups",	tcu::IVec3(3,4,1),	tcu::IVec3(2,7,3), computePipelineConstructionType));
4391 
4392 	// Atomic operation with shared var
4393 	basicComputeTests->addChild(new SharedVarAtomicOpTest(testCtx,	"shared_atomic_op_single_invocation", tcu::IVec3(1,1,1),	tcu::IVec3(1,1,1), computePipelineConstructionType));
4394 	basicComputeTests->addChild(new SharedVarAtomicOpTest(testCtx,	"shared_atomic_op_single_group",		tcu::IVec3(3,2,5),	tcu::IVec3(1,1,1), computePipelineConstructionType));
4395 	basicComputeTests->addChild(new SharedVarAtomicOpTest(testCtx,	"shared_atomic_op_multiple_invocations",		tcu::IVec3(1,1,1),	tcu::IVec3(2,5,4), computePipelineConstructionType));
4396 	basicComputeTests->addChild(new SharedVarAtomicOpTest(testCtx,	"shared_atomic_op_multiple_groups",		tcu::IVec3(3,4,1),	tcu::IVec3(2,7,3), computePipelineConstructionType));
4397 
4398 	// Image to SSBO copy
4399 	basicComputeTests->addChild(new CopyImageToSSBOTest(testCtx,	"copy_image_to_ssbo_small", tcu::IVec2(1,1),	tcu::IVec2(64,64),		computePipelineConstructionType));
4400 	basicComputeTests->addChild(new CopyImageToSSBOTest(testCtx,	"copy_image_to_ssbo_large",	tcu::IVec2(2,4),	tcu::IVec2(512,512),	computePipelineConstructionType));
4401 
4402 	// SSBO to image copy
4403 	basicComputeTests->addChild(new CopySSBOToImageTest(testCtx,	"copy_ssbo_to_image_small", tcu::IVec2(1, 1),	tcu::IVec2(64, 64),		computePipelineConstructionType));
4404 	basicComputeTests->addChild(new CopySSBOToImageTest(testCtx,	"copy_ssbo_to_image_large",	tcu::IVec2(2, 4),	tcu::IVec2(512, 512),	computePipelineConstructionType));
4405 
4406 	// Atomic operation with image
4407 	basicComputeTests->addChild(new ImageAtomicOpTest(testCtx,	"image_atomic_op_local_size_1", 1,	tcu::IVec2(64,64),	computePipelineConstructionType));
4408 	basicComputeTests->addChild(new ImageAtomicOpTest(testCtx,	"image_atomic_op_local_size_8",	8,	tcu::IVec2(64,64),	computePipelineConstructionType));
4409 
4410 	// Image barrier
4411 	basicComputeTests->addChild(new ImageBarrierTest(testCtx,	"image_barrier_single", tcu::IVec2(1,1),	computePipelineConstructionType));
4412 	basicComputeTests->addChild(new ImageBarrierTest(testCtx,	"image_barrier_multiple",	tcu::IVec2(64,64),	computePipelineConstructionType));
4413 
4414 	// Test secondary command buffers in compute only queues
4415 	basicComputeTests->addChild(new ComputeOnlyQueueTests::SecondaryCommandBufferComputeOnlyTest(testCtx, "secondary_compute_only_queue"));
4416 
4417 #ifndef CTS_USES_VULKANSC
4418 	basicComputeTests->addChild(cts_amber::createAmberTestCase(testCtx, "write_ssbo_array", "", "compute", "write_ssbo_array.amber"));
4419 	basicComputeTests->addChild(cts_amber::createAmberTestCase(testCtx, "branch_past_barrier", "", "compute", "branch_past_barrier.amber"));
4420 	basicComputeTests->addChild(cts_amber::createAmberTestCase(testCtx,"webgl_spirv_loop", "Simple SPIR-V loop from a WebGL example that caused problems in some implementations", "compute", "webgl_spirv_loop.amber"));
4421 #endif
4422 
4423 	return basicComputeTests.release();
4424 }
4425 
createBasicDeviceGroupComputeShaderTests(tcu::TestContext & testCtx,vk::ComputePipelineConstructionType computePipelineConstructionType)4426 tcu::TestCaseGroup* createBasicDeviceGroupComputeShaderTests (tcu::TestContext& testCtx, vk::ComputePipelineConstructionType computePipelineConstructionType)
4427 {
4428 	de::MovePtr<tcu::TestCaseGroup> deviceGroupComputeTests(new tcu::TestCaseGroup(testCtx, "device_group"));
4429 
4430 	deviceGroupComputeTests->addChild(new DispatchBaseTest(testCtx,	"dispatch_base",	32768,	tcu::IVec3(4,2,4),	tcu::IVec3(16,8,8),	tcu::IVec3(4,8,8), computePipelineConstructionType, false));
4431 #ifndef CTS_USES_VULKANSC
4432 	deviceGroupComputeTests->addChild(new DispatchBaseTest(testCtx, "dispatch_base_maintenance5",	32768, tcu::IVec3(4, 2, 4), tcu::IVec3(16, 8, 8), tcu::IVec3(4, 8, 8), computePipelineConstructionType, true));
4433 #endif
4434 	deviceGroupComputeTests->addChild(new DeviceIndexTest(testCtx,	"device_index",	96,		tcu::IVec3(3,2,1),	tcu::IVec3(2,4,1), computePipelineConstructionType));
4435 
4436 	return deviceGroupComputeTests.release();
4437 
4438 }
4439 } // compute
4440 } // vkt
4441