• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*------------------------------------------------------------------------
2  * Vulkan Conformance Tests
3  * ------------------------
4  *
5  * Copyright (c) 2019 The Khronos Group Inc.
6  * Copyright (c) 2019 The Android Open Source Project
7  *
8  * Licensed under the Apache License, Version 2.0 (the "License");
9  * you may not use this file except in compliance with the License.
10  * You may obtain a copy of the License at
11  *
12  *      http://www.apache.org/licenses/LICENSE-2.0
13  *
14  * Unless required by applicable law or agreed to in writing, software
15  * distributed under the License is distributed on an "AS IS" BASIS,
16  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17  * See the License for the specific language governing permissions and
18  * limitations under the License.
19  *
20  *//*!
21  * \file
22  * \brief Compute Shader Tests
23  *//*--------------------------------------------------------------------*/
24 
25 #include "vktComputeBasicComputeShaderTests.hpp"
26 #include "vktTestCase.hpp"
27 #include "vktTestCaseUtil.hpp"
28 #include "vktComputeTestsUtil.hpp"
29 #include "vktCustomInstancesDevices.hpp"
30 #include "vktAmberTestCase.hpp"
31 
32 #include "vkDefs.hpp"
33 #include "vkRef.hpp"
34 #include "vkRefUtil.hpp"
35 #include "vkPlatform.hpp"
36 #include "vkPrograms.hpp"
37 #include "vkRefUtil.hpp"
38 #include "vkMemUtil.hpp"
39 #include "vkBarrierUtil.hpp"
40 #include "vkQueryUtil.hpp"
41 #include "vkBuilderUtil.hpp"
42 #include "vkTypeUtil.hpp"
43 #include "vkDeviceUtil.hpp"
44 #include "vkCmdUtil.hpp"
45 #include "vkObjUtil.hpp"
46 #include "vkBufferWithMemory.hpp"
47 #include "vkSafetyCriticalUtil.hpp"
48 #include "vkImageWithMemory.hpp"
49 
50 #include "tcuCommandLine.hpp"
51 #include "tcuTestLog.hpp"
52 
53 #include "deStringUtil.hpp"
54 #include "deUniquePtr.hpp"
55 #include "deRandom.hpp"
56 
57 #include <vector>
58 #include <memory>
59 
60 using namespace vk;
61 
62 namespace vkt
63 {
64 namespace compute
65 {
66 namespace
67 {
68 
69 template<typename T, int size>
multiplyComponents(const tcu::Vector<T,size> & v)70 T multiplyComponents (const tcu::Vector<T, size>& v)
71 {
72 	T accum = 1;
73 	for (int i = 0; i < size; ++i)
74 		accum *= v[i];
75 	return accum;
76 }
77 
// Returns the value multiplied by itself.
template<typename T>
inline T squared (const T& a)
{
	const T result = a * a;
	return result;
}
83 
make2DImageCreateInfo(const tcu::IVec2 & imageSize,const VkImageUsageFlags usage)84 inline VkImageCreateInfo make2DImageCreateInfo (const tcu::IVec2& imageSize, const VkImageUsageFlags usage)
85 {
86 	const VkImageCreateInfo imageParams =
87 	{
88 		VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,				// VkStructureType			sType;
89 		DE_NULL,											// const void*				pNext;
90 		0u,													// VkImageCreateFlags		flags;
91 		VK_IMAGE_TYPE_2D,									// VkImageType				imageType;
92 		VK_FORMAT_R32_UINT,									// VkFormat					format;
93 		vk::makeExtent3D(imageSize.x(), imageSize.y(), 1),	// VkExtent3D				extent;
94 		1u,													// deUint32					mipLevels;
95 		1u,													// deUint32					arrayLayers;
96 		VK_SAMPLE_COUNT_1_BIT,								// VkSampleCountFlagBits	samples;
97 		VK_IMAGE_TILING_OPTIMAL,							// VkImageTiling			tiling;
98 		usage,												// VkImageUsageFlags		usage;
99 		VK_SHARING_MODE_EXCLUSIVE,							// VkSharingMode			sharingMode;
100 		0u,													// deUint32					queueFamilyIndexCount;
101 		DE_NULL,											// const deUint32*			pQueueFamilyIndices;
102 		VK_IMAGE_LAYOUT_UNDEFINED,							// VkImageLayout			initialLayout;
103 	};
104 	return imageParams;
105 }
106 
makeBufferImageCopy(const tcu::IVec2 & imageSize)107 inline VkBufferImageCopy makeBufferImageCopy(const tcu::IVec2& imageSize)
108 {
109 	return compute::makeBufferImageCopy(vk::makeExtent3D(imageSize.x(), imageSize.y(), 1), 1u);
110 }
111 
// Kind of Vulkan buffer a test variant operates on.
enum BufferType
{
	BUFFER_TYPE_UNIFORM,	// uniform buffer
	BUFFER_TYPE_SSBO,		// shader storage buffer
};
117 
// Test case: compute shader communicating through a `shared` array plus
// barrier; generated from a local (workgroup) size and a dispatch size.
class SharedVarTest : public vkt::TestCase
{
public:
						SharedVarTest	(tcu::TestContext&		testCtx,
										 const std::string&		name,
										 const std::string&		description,
										 const tcu::IVec3&		localSize,
										 const tcu::IVec3&		workSize);

	void				initPrograms	(SourceCollections&		sourceCollections) const;
	TestInstance*		createInstance	(Context&				context) const;

private:
	const tcu::IVec3	m_localSize;	// gl_WorkGroupSize for the shader
	const tcu::IVec3	m_workSize;		// number of workgroups dispatched
};
134 
// Per-run instance for SharedVarTest; records and submits the dispatch and
// validates the output buffer on the host.
class SharedVarTestInstance : public vkt::TestInstance
{
public:
									SharedVarTestInstance	(Context&			context,
															 const tcu::IVec3&	localSize,
															 const tcu::IVec3&	workSize);

	tcu::TestStatus					iterate					(void);

private:
	const tcu::IVec3				m_localSize;
	const tcu::IVec3				m_workSize;
};
148 
// Stores the workgroup-local size and the dispatch size used both for shader
// generation (initPrograms) and result verification.
SharedVarTest::SharedVarTest (tcu::TestContext&		testCtx,
							  const std::string&	name,
							  const std::string&	description,
							  const tcu::IVec3&		localSize,
							  const tcu::IVec3&		workSize)
	: TestCase		(testCtx, name, description)
	, m_localSize	(localSize)
	, m_workSize	(workSize)
{
}
159 
initPrograms(SourceCollections & sourceCollections) const160 void SharedVarTest::initPrograms (SourceCollections& sourceCollections) const
161 {
162 	const int workGroupSize = multiplyComponents(m_localSize);
163 	const int workGroupCount = multiplyComponents(m_workSize);
164 	const int numValues = workGroupSize * workGroupCount;
165 
166 	std::ostringstream src;
167 	src << "#version 310 es\n"
168 		<< "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
169 		<< "layout(binding = 0) writeonly buffer Output {\n"
170 		<< "    uint values[" << numValues << "];\n"
171 		<< "} sb_out;\n\n"
172 		<< "shared uint offsets[" << workGroupSize << "];\n\n"
173 		<< "void main (void) {\n"
174 		<< "    uint localSize  = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_WorkGroupSize.z;\n"
175 		<< "    uint globalNdx  = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
176 		<< "    uint globalOffs = localSize*globalNdx;\n"
177 		<< "    uint localOffs  = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_LocalInvocationID.z + gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x;\n"
178 		<< "\n"
179 		<< "    offsets[localSize-localOffs-1u] = globalOffs + localOffs*localOffs;\n"
180 		<< "    memoryBarrierShared();\n"
181 		<< "    barrier();\n"
182 		<< "    sb_out.values[globalOffs + localOffs] = offsets[localOffs];\n"
183 		<< "}\n";
184 
185 	sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
186 }
187 
// Factory for the runnable instance; the caller takes ownership of the
// returned raw pointer (framework convention).
TestInstance* SharedVarTest::createInstance (Context& context) const
{
	return new SharedVarTestInstance(context, m_localSize, m_workSize);
}
192 
// Captures the sizes needed to dispatch and to compute reference values.
SharedVarTestInstance::SharedVarTestInstance (Context& context, const tcu::IVec3& localSize, const tcu::IVec3& workSize)
	: TestInstance	(context)
	, m_localSize	(localSize)
	, m_workSize	(workSize)
{
}
199 
// Runs the shared-variable compute shader once and checks the SSBO contents
// against a host-computed reference.
tcu::TestStatus SharedVarTestInstance::iterate (void)
{
	const DeviceInterface&	vk					= m_context.getDeviceInterface();
	const VkDevice			device				= m_context.getDevice();
	const VkQueue			queue				= m_context.getUniversalQueue();
	const deUint32			queueFamilyIndex	= m_context.getUniversalQueueFamilyIndex();
	Allocator&				allocator			= m_context.getDefaultAllocator();

	const int workGroupSize = multiplyComponents(m_localSize);
	const int workGroupCount = multiplyComponents(m_workSize);

	// Create a buffer and host-visible memory for it

	const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * workGroupSize * workGroupCount;
	const BufferWithMemory buffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);

	// Create descriptor set: one storage buffer at binding 0 (matches the shader's sb_out).

	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
		DescriptorSetLayoutBuilder()
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.build(vk, device));

	const Unique<VkDescriptorPool> descriptorPool(
		DescriptorPoolBuilder()
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

	const VkDescriptorBufferInfo descriptorInfo = makeDescriptorBufferInfo(*buffer, 0ull, bufferSizeBytes);
	DescriptorSetUpdateBuilder()
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &descriptorInfo)
		.update(vk, device);

	// Perform the computation

	const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
	const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
	const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));

	// Make shader writes visible to host reads after the dispatch.
	const VkBufferMemoryBarrier computeFinishBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer, 0ull, bufferSizeBytes);

	const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
	const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

	// Start recording commands

	beginCommandBuffer(vk, *cmdBuffer);

	vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
	vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);

	vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());

	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &computeFinishBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

	endCommandBuffer(vk, *cmdBuffer);

	// Wait for completion

	submitCommandsAndWait(vk, device, queue, *cmdBuffer);

	// Validate the results

	const Allocation& bufferAllocation = buffer.getAllocation();
	invalidateAlloc(vk, device, bufferAllocation);

	const deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());

	for (int groupNdx = 0; groupNdx < workGroupCount; ++groupNdx)
	{
		const int globalOffset = groupNdx * workGroupSize;
		for (int localOffset = 0; localOffset < workGroupSize; ++localOffset)
		{
			const deUint32 res = bufferPtr[globalOffset + localOffset];
			// The shader stored globalOffs + j*j at shared index localSize-j-1,
			// so slot i holds globalOffs + (localSize-i-1)^2.
			const deUint32 ref = globalOffset + squared(workGroupSize - localOffset - 1);

			if (res != ref)
			{
				std::ostringstream msg;
				msg << "Comparison failed for Output.values[" << (globalOffset + localOffset) << "]";
				return tcu::TestStatus::fail(msg.str());
			}
		}
	}
	return tcu::TestStatus::pass("Compute succeeded");
}
288 
// Test case: atomicAdd on a `shared` counter; each invocation claims a unique
// slot in the output SSBO via the returned pre-increment value.
class SharedVarAtomicOpTest : public vkt::TestCase
{
public:
						SharedVarAtomicOpTest	(tcu::TestContext&	testCtx,
												 const std::string&	name,
												 const std::string&	description,
												 const tcu::IVec3&	localSize,
												 const tcu::IVec3&	workSize);

	void				initPrograms			(SourceCollections& sourceCollections) const;
	TestInstance*		createInstance			(Context&			context) const;

private:
	const tcu::IVec3	m_localSize;	// gl_WorkGroupSize for the shader
	const tcu::IVec3	m_workSize;		// number of workgroups dispatched
};
305 
// Per-run instance for SharedVarAtomicOpTest; dispatches once and verifies
// the SSBO on the host.
class SharedVarAtomicOpTestInstance : public vkt::TestInstance
{
public:
									SharedVarAtomicOpTestInstance	(Context&			context,
																	 const tcu::IVec3&	localSize,
																	 const tcu::IVec3&	workSize);

	tcu::TestStatus					iterate							(void);

private:
	const tcu::IVec3				m_localSize;
	const tcu::IVec3				m_workSize;
};
319 
// Stores the workgroup-local size and dispatch size used for shader
// generation and verification.
SharedVarAtomicOpTest::SharedVarAtomicOpTest (tcu::TestContext&		testCtx,
											  const std::string&	name,
											  const std::string&	description,
											  const tcu::IVec3&		localSize,
											  const tcu::IVec3&		workSize)
	: TestCase		(testCtx, name, description)
	, m_localSize	(localSize)
	, m_workSize	(workSize)
{
}
330 
initPrograms(SourceCollections & sourceCollections) const331 void SharedVarAtomicOpTest::initPrograms (SourceCollections& sourceCollections) const
332 {
333 	const int workGroupSize = multiplyComponents(m_localSize);
334 	const int workGroupCount = multiplyComponents(m_workSize);
335 	const int numValues = workGroupSize * workGroupCount;
336 
337 	std::ostringstream src;
338 	src << "#version 310 es\n"
339 		<< "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
340 		<< "layout(binding = 0) writeonly buffer Output {\n"
341 		<< "    uint values[" << numValues << "];\n"
342 		<< "} sb_out;\n\n"
343 		<< "shared uint count;\n\n"
344 		<< "void main (void) {\n"
345 		<< "    uint localSize  = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_WorkGroupSize.z;\n"
346 		<< "    uint globalNdx  = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
347 		<< "    uint globalOffs = localSize*globalNdx;\n"
348 		<< "\n"
349 		<< "    count = 0u;\n"
350 		<< "    memoryBarrierShared();\n"
351 		<< "    barrier();\n"
352 		<< "    uint oldVal = atomicAdd(count, 1u);\n"
353 		<< "    sb_out.values[globalOffs+oldVal] = oldVal+1u;\n"
354 		<< "}\n";
355 
356 	sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
357 }
358 
// Factory for the runnable instance; the caller takes ownership of the
// returned raw pointer (framework convention).
TestInstance* SharedVarAtomicOpTest::createInstance (Context& context) const
{
	return new SharedVarAtomicOpTestInstance(context, m_localSize, m_workSize);
}
363 
// Captures the sizes needed to dispatch and to compute reference values.
SharedVarAtomicOpTestInstance::SharedVarAtomicOpTestInstance (Context& context, const tcu::IVec3& localSize, const tcu::IVec3& workSize)
	: TestInstance	(context)
	, m_localSize	(localSize)
	, m_workSize	(workSize)
{
}
370 
// Runs the shared-counter atomicAdd shader once and checks the SSBO contents
// against a host-computed reference.
tcu::TestStatus SharedVarAtomicOpTestInstance::iterate (void)
{
	const DeviceInterface&	vk					= m_context.getDeviceInterface();
	const VkDevice			device				= m_context.getDevice();
	const VkQueue			queue				= m_context.getUniversalQueue();
	const deUint32			queueFamilyIndex	= m_context.getUniversalQueueFamilyIndex();
	Allocator&				allocator			= m_context.getDefaultAllocator();

	const int workGroupSize = multiplyComponents(m_localSize);
	const int workGroupCount = multiplyComponents(m_workSize);

	// Create a buffer and host-visible memory for it

	const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * workGroupSize * workGroupCount;
	const BufferWithMemory buffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);

	// Create descriptor set: one storage buffer at binding 0 (matches the shader's sb_out).

	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
		DescriptorSetLayoutBuilder()
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.build(vk, device));

	const Unique<VkDescriptorPool> descriptorPool(
		DescriptorPoolBuilder()
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

	const VkDescriptorBufferInfo descriptorInfo = makeDescriptorBufferInfo(*buffer, 0ull, bufferSizeBytes);
	DescriptorSetUpdateBuilder()
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &descriptorInfo)
		.update(vk, device);

	// Perform the computation

	const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
	const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
	const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));

	// Make shader writes visible to host reads after the dispatch.
	const VkBufferMemoryBarrier computeFinishBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer, 0ull, bufferSizeBytes);

	const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
	const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

	// Start recording commands

	beginCommandBuffer(vk, *cmdBuffer);

	vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
	vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);

	vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());

	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1u, &computeFinishBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

	endCommandBuffer(vk, *cmdBuffer);

	// Wait for completion

	submitCommandsAndWait(vk, device, queue, *cmdBuffer);

	// Validate the results

	const Allocation& bufferAllocation = buffer.getAllocation();
	invalidateAlloc(vk, device, bufferAllocation);

	const deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());

	for (int groupNdx = 0; groupNdx < workGroupCount; ++groupNdx)
	{
		const int globalOffset = groupNdx * workGroupSize;
		for (int localOffset = 0; localOffset < workGroupSize; ++localOffset)
		{
			const deUint32 res = bufferPtr[globalOffset + localOffset];
			// The invocation that got counter value k wrote k+1 into slot
			// globalOffs+k, so slot i must hold i+1 regardless of ordering.
			const deUint32 ref = localOffset + 1;

			if (res != ref)
			{
				std::ostringstream msg;
				msg << "Comparison failed for Output.values[" << (globalOffset + localOffset) << "]";
				return tcu::TestStatus::fail(msg.str());
			}
		}
	}
	return tcu::TestStatus::pass("Compute succeeded");
}
459 
// Test case: workgroup-local communication through a coherent SSBO, using
// memoryBarrierBuffer() + barrier() between dependent accesses.
class SSBOLocalBarrierTest : public vkt::TestCase
{
public:
						SSBOLocalBarrierTest	(tcu::TestContext&	testCtx,
												 const std::string& name,
												 const std::string&	description,
												 const tcu::IVec3&	localSize,
												 const tcu::IVec3&	workSize);

	void				initPrograms			(SourceCollections& sourceCollections) const;
	TestInstance*		createInstance			(Context&			context) const;

private:
	const tcu::IVec3	m_localSize;	// gl_WorkGroupSize for the shader
	const tcu::IVec3	m_workSize;		// number of workgroups dispatched
};
476 
// Per-run instance for SSBOLocalBarrierTest; dispatches once and verifies the
// SSBO on the host.
class SSBOLocalBarrierTestInstance : public vkt::TestInstance
{
public:
									SSBOLocalBarrierTestInstance	(Context&			context,
																	 const tcu::IVec3&	localSize,
																	 const tcu::IVec3&	workSize);

	tcu::TestStatus					iterate							(void);

private:
	const tcu::IVec3				m_localSize;
	const tcu::IVec3				m_workSize;
};
490 
// Stores the workgroup-local size and dispatch size used for shader
// generation and verification.
SSBOLocalBarrierTest::SSBOLocalBarrierTest (tcu::TestContext&	testCtx,
											const std::string&	name,
											const std::string&	description,
											const tcu::IVec3&	localSize,
											const tcu::IVec3&	workSize)
	: TestCase		(testCtx, name, description)
	, m_localSize	(localSize)
	, m_workSize	(workSize)
{
}
501 
initPrograms(SourceCollections & sourceCollections) const502 void SSBOLocalBarrierTest::initPrograms (SourceCollections& sourceCollections) const
503 {
504 	const int workGroupSize = multiplyComponents(m_localSize);
505 	const int workGroupCount = multiplyComponents(m_workSize);
506 	const int numValues = workGroupSize * workGroupCount;
507 
508 	std::ostringstream src;
509 	src << "#version 310 es\n"
510 		<< "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
511 		<< "layout(binding = 0) coherent buffer Output {\n"
512 		<< "    uint values[" << numValues << "];\n"
513 		<< "} sb_out;\n\n"
514 		<< "void main (void) {\n"
515 		<< "    uint localSize  = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_WorkGroupSize.z;\n"
516 		<< "    uint globalNdx  = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
517 		<< "    uint globalOffs = localSize*globalNdx;\n"
518 		<< "    uint localOffs  = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_LocalInvocationID.z + gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x;\n"
519 		<< "\n"
520 		<< "    sb_out.values[globalOffs + localOffs] = globalOffs;\n"
521 		<< "    memoryBarrierBuffer();\n"
522 		<< "    barrier();\n"
523 		<< "    sb_out.values[globalOffs + ((localOffs+1u)%localSize)] += localOffs;\n"		// += so we read and write
524 		<< "    memoryBarrierBuffer();\n"
525 		<< "    barrier();\n"
526 		<< "    sb_out.values[globalOffs + ((localOffs+2u)%localSize)] += localOffs;\n"
527 		<< "}\n";
528 
529 	sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
530 }
531 
// Factory for the runnable instance; the caller takes ownership of the
// returned raw pointer (framework convention).
TestInstance* SSBOLocalBarrierTest::createInstance (Context& context) const
{
	return new SSBOLocalBarrierTestInstance(context, m_localSize, m_workSize);
}
536 
// Captures the sizes needed to dispatch and to compute reference values.
SSBOLocalBarrierTestInstance::SSBOLocalBarrierTestInstance (Context& context, const tcu::IVec3& localSize, const tcu::IVec3& workSize)
	: TestInstance	(context)
	, m_localSize	(localSize)
	, m_workSize	(workSize)
{
}
543 
// Runs the SSBO local-barrier shader once and checks the SSBO contents
// against a host-computed reference.
tcu::TestStatus SSBOLocalBarrierTestInstance::iterate (void)
{
	const DeviceInterface&	vk					= m_context.getDeviceInterface();
	const VkDevice			device				= m_context.getDevice();
	const VkQueue			queue				= m_context.getUniversalQueue();
	const deUint32			queueFamilyIndex	= m_context.getUniversalQueueFamilyIndex();
	Allocator&				allocator			= m_context.getDefaultAllocator();

	const int workGroupSize = multiplyComponents(m_localSize);
	const int workGroupCount = multiplyComponents(m_workSize);

	// Create a buffer and host-visible memory for it

	const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * workGroupSize * workGroupCount;
	const BufferWithMemory buffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);

	// Create descriptor set: one storage buffer at binding 0 (matches the shader's sb_out).

	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
		DescriptorSetLayoutBuilder()
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.build(vk, device));

	const Unique<VkDescriptorPool> descriptorPool(
		DescriptorPoolBuilder()
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

	const VkDescriptorBufferInfo descriptorInfo = makeDescriptorBufferInfo(*buffer, 0ull, bufferSizeBytes);
	DescriptorSetUpdateBuilder()
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &descriptorInfo)
		.update(vk, device);

	// Perform the computation

	const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
	const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
	const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));

	// Make shader writes visible to host reads after the dispatch.
	const VkBufferMemoryBarrier computeFinishBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer, 0ull, bufferSizeBytes);

	const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
	const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

	// Start recording commands

	beginCommandBuffer(vk, *cmdBuffer);

	vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
	vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);

	vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());

	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &computeFinishBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

	endCommandBuffer(vk, *cmdBuffer);

	// Wait for completion

	submitCommandsAndWait(vk, device, queue, *cmdBuffer);

	// Validate the results

	const Allocation& bufferAllocation = buffer.getAllocation();
	invalidateAlloc(vk, device, bufferAllocation);

	const deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());

	for (int groupNdx = 0; groupNdx < workGroupCount; ++groupNdx)
	{
		const int globalOffset = groupNdx * workGroupSize;
		for (int localOffset = 0; localOffset < workGroupSize; ++localOffset)
		{
			const deUint32	res		= bufferPtr[globalOffset + localOffset];
			// Slot i was seeded with globalOffs and then incremented by the
			// invocations at (i-1) mod localSize and (i-2) mod localSize.
			const int		offs0	= localOffset - 1 < 0 ? ((localOffset + workGroupSize - 1) % workGroupSize) : ((localOffset - 1) % workGroupSize);
			const int		offs1	= localOffset - 2 < 0 ? ((localOffset + workGroupSize - 2) % workGroupSize) : ((localOffset - 2) % workGroupSize);
			const deUint32	ref		= static_cast<deUint32>(globalOffset + offs0 + offs1);

			if (res != ref)
			{
				std::ostringstream msg;
				msg << "Comparison failed for Output.values[" << (globalOffset + localOffset) << "]";
				return tcu::TestStatus::fail(msg.str());
			}
		}
	}
	return tcu::TestStatus::pass("Compute succeeded");
}
634 
// Test case: compute shader copies each texel of an R32_UINT storage image
// into a storage buffer via imageLoad.
class CopyImageToSSBOTest : public vkt::TestCase
{
public:
						CopyImageToSSBOTest		(tcu::TestContext&	testCtx,
												 const std::string&	name,
												 const std::string&	description,
												 const tcu::IVec2&	localSize,
												 const tcu::IVec2&	imageSize);

	void				initPrograms			(SourceCollections& sourceCollections) const;
	TestInstance*		createInstance			(Context&			context) const;

private:
	const tcu::IVec2	m_localSize;	// 2D workgroup size
	const tcu::IVec2	m_imageSize;	// 2D image dimensions (must be a multiple of m_localSize)
};
651 
// Per-run instance for CopyImageToSSBOTest; uploads an image, dispatches the
// copy shader, and verifies the output buffer on the host.
class CopyImageToSSBOTestInstance : public vkt::TestInstance
{
public:
									CopyImageToSSBOTestInstance		(Context&			context,
																	 const tcu::IVec2&	localSize,
																	 const tcu::IVec2&	imageSize);

	tcu::TestStatus					iterate							(void);

private:
	const tcu::IVec2				m_localSize;
	const tcu::IVec2				m_imageSize;
};
665 
// Stores the 2D workgroup size and image size. The image must tile evenly
// into workgroups, since the shader has no bounds check on the image edge.
CopyImageToSSBOTest::CopyImageToSSBOTest (tcu::TestContext&		testCtx,
										  const std::string&	name,
										  const std::string&	description,
										  const tcu::IVec2&		localSize,
										  const tcu::IVec2&		imageSize)
	: TestCase		(testCtx, name, description)
	, m_localSize	(localSize)
	, m_imageSize	(imageSize)
{
	DE_ASSERT(m_imageSize.x() % m_localSize.x() == 0);
	DE_ASSERT(m_imageSize.y() % m_localSize.y() == 0);
}
678 
initPrograms(SourceCollections & sourceCollections) const679 void CopyImageToSSBOTest::initPrograms (SourceCollections& sourceCollections) const
680 {
681 	std::ostringstream src;
682 	src << "#version 310 es\n"
683 		<< "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ") in;\n"
684 		<< "layout(binding = 1, r32ui) readonly uniform highp uimage2D u_srcImg;\n"
685 		<< "layout(binding = 0) writeonly buffer Output {\n"
686 		<< "    uint values[" << (m_imageSize.x() * m_imageSize.y()) << "];\n"
687 		<< "} sb_out;\n\n"
688 		<< "void main (void) {\n"
689 		<< "    uint stride = gl_NumWorkGroups.x*gl_WorkGroupSize.x;\n"
690 		<< "    uint value  = imageLoad(u_srcImg, ivec2(gl_GlobalInvocationID.xy)).x;\n"
691 		<< "    sb_out.values[gl_GlobalInvocationID.y*stride + gl_GlobalInvocationID.x] = value;\n"
692 		<< "}\n";
693 
694 	sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
695 }
696 
// Factory for the runnable instance; the caller takes ownership of the
// returned raw pointer (framework convention).
TestInstance* CopyImageToSSBOTest::createInstance (Context& context) const
{
	return new CopyImageToSSBOTestInstance(context, m_localSize, m_imageSize);
}
701 
// Captures the workgroup size and image size needed by iterate().
CopyImageToSSBOTestInstance::CopyImageToSSBOTestInstance (Context& context, const tcu::IVec2& localSize, const tcu::IVec2& imageSize)
	: TestInstance	(context)
	, m_localSize	(localSize)
	, m_imageSize	(imageSize)
{
}
708 
// Uploads random texel data into an R32_UINT storage image via a staging buffer,
// dispatches a compute shader that copies each texel into an SSBO, then reads the
// SSBO back on the host and compares it against the original staging data.
tcu::TestStatus CopyImageToSSBOTestInstance::iterate (void)
{
	const DeviceInterface&	vk					= m_context.getDeviceInterface();
	const VkDevice			device				= m_context.getDevice();
	const VkQueue			queue				= m_context.getUniversalQueue();
	const deUint32			queueFamilyIndex	= m_context.getUniversalQueueFamilyIndex();
	Allocator&				allocator			= m_context.getDefaultAllocator();

	// Create an image
	// TRANSFER_DST for the staging upload, STORAGE so the shader can imageLoad() from it.

	const VkImageCreateInfo imageParams = make2DImageCreateInfo(m_imageSize, VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_STORAGE_BIT);
	const ImageWithMemory image(vk, device, allocator, imageParams, MemoryRequirement::Any);

	const VkImageSubresourceRange subresourceRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
	const Unique<VkImageView> imageView(makeImageView(vk, device, *image, VK_IMAGE_VIEW_TYPE_2D, VK_FORMAT_R32_UINT, subresourceRange));

	// Staging buffer (source data for image)

	const deUint32 imageArea = multiplyComponents(m_imageSize);
	const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * imageArea;	// one 32-bit value per texel

	const BufferWithMemory stagingBuffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_TRANSFER_SRC_BIT), MemoryRequirement::HostVisible);

	// Populate the staging buffer with test data
	{
		de::Random rnd(0xab2c7);	// fixed seed for reproducible input
		const Allocation& stagingBufferAllocation = stagingBuffer.getAllocation();
		deUint32* bufferPtr = static_cast<deUint32*>(stagingBufferAllocation.getHostPtr());
		for (deUint32 i = 0; i < imageArea; ++i)
			*bufferPtr++ = rnd.getUint32();

		flushAlloc(vk, device, stagingBufferAllocation);
	}

	// Create a buffer to store shader output

	const BufferWithMemory outputBuffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);

	// Create descriptor set
	// Layout must match the shader: binding 0 = SSBO output, binding 1 = storage image input.

	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
		DescriptorSetLayoutBuilder()
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_SHADER_STAGE_COMPUTE_BIT)
		.build(vk, device));

	const Unique<VkDescriptorPool> descriptorPool(
		DescriptorPoolBuilder()
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE)
		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

	// Set the bindings

	const VkDescriptorImageInfo imageDescriptorInfo = makeDescriptorImageInfo(DE_NULL, *imageView, VK_IMAGE_LAYOUT_GENERAL);
	const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*outputBuffer, 0ull, bufferSizeBytes);

	DescriptorSetUpdateBuilder()
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &imageDescriptorInfo)
		.update(vk, device);

	// Perform the computation
	{
		const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
		const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
		const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));

		// Makes the shader's SSBO writes visible to the host read-back below.
		const VkBufferMemoryBarrier computeFinishBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *outputBuffer, 0ull, bufferSizeBytes);
		// One work group per localSize-sized tile; the ctor asserts even divisibility.
		const tcu::IVec2 workSize = m_imageSize / m_localSize;

		// Prepare the command buffer

		const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
		const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

		// Start recording commands

		beginCommandBuffer(vk, *cmdBuffer);

		vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
		vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);

		// Upload staging data into the image; the compute-shader dstStage argument
		// is presumably used by the helper for its post-copy barrier — it should
		// make the transfer visible before the dispatch below.
		const std::vector<VkBufferImageCopy> bufferImageCopy(1, makeBufferImageCopy(m_imageSize));
		copyBufferToImage(vk, *cmdBuffer, *stagingBuffer, bufferSizeBytes, bufferImageCopy, VK_IMAGE_ASPECT_COLOR_BIT, 1, 1, *image, VK_IMAGE_LAYOUT_GENERAL, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT);

		vk.cmdDispatch(*cmdBuffer, workSize.x(), workSize.y(), 1u);
		vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &computeFinishBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

		endCommandBuffer(vk, *cmdBuffer);

		// Wait for completion

		submitCommandsAndWait(vk, device, queue, *cmdBuffer);
	}

	// Validate the results

	const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
	invalidateAlloc(vk, device, outputBufferAllocation);

	// The shader copies texels verbatim, so the output must equal the staging data.
	const deUint32* bufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
	const deUint32* refBufferPtr = static_cast<deUint32*>(stagingBuffer.getAllocation().getHostPtr());

	for (deUint32 ndx = 0; ndx < imageArea; ++ndx)
	{
		const deUint32 res = *(bufferPtr + ndx);
		const deUint32 ref = *(refBufferPtr + ndx);

		if (res != ref)
		{
			std::ostringstream msg;
			msg << "Comparison failed for Output.values[" << ndx << "]";
			return tcu::TestStatus::fail(msg.str());
		}
	}
	return tcu::TestStatus::pass("Compute succeeded");
}
829 
// Test case: a compute shader reads values from an SSBO and writes them into a
// storage image (the inverse direction of CopyImageToSSBOTest).
class CopySSBOToImageTest : public vkt::TestCase
{
public:
						CopySSBOToImageTest	(tcu::TestContext&	testCtx,
											 const std::string&	name,
											 const std::string&	description,
											 const tcu::IVec2&	localSize,
											 const tcu::IVec2&	imageSize);

	void				initPrograms		(SourceCollections& sourceCollections) const;
	TestInstance*		createInstance		(Context&			context) const;

private:
	const tcu::IVec2	m_localSize;	// compute work group size (x, y)
	const tcu::IVec2	m_imageSize;	// image dimensions; must be a multiple of m_localSize
};
846 
// Per-execution instance for CopySSBOToImageTest; iterate() runs the dispatch
// and verifies the resulting image contents.
class CopySSBOToImageTestInstance : public vkt::TestInstance
{
public:
									CopySSBOToImageTestInstance	(Context&			context,
																 const tcu::IVec2&	localSize,
																 const tcu::IVec2&	imageSize);

	tcu::TestStatus					iterate						(void);

private:
	const tcu::IVec2				m_localSize;	// compute work group size (x, y)
	const tcu::IVec2				m_imageSize;	// image dimensions
};
860 
CopySSBOToImageTest(tcu::TestContext & testCtx,const std::string & name,const std::string & description,const tcu::IVec2 & localSize,const tcu::IVec2 & imageSize)861 CopySSBOToImageTest::CopySSBOToImageTest (tcu::TestContext&		testCtx,
862 										  const std::string&	name,
863 										  const std::string&	description,
864 										  const tcu::IVec2&		localSize,
865 										  const tcu::IVec2&		imageSize)
866 	: TestCase		(testCtx, name, description)
867 	, m_localSize	(localSize)
868 	, m_imageSize	(imageSize)
869 {
870 	DE_ASSERT(m_imageSize.x() % m_localSize.x() == 0);
871 	DE_ASSERT(m_imageSize.y() % m_localSize.y() == 0);
872 }
873 
// Generates the GLSL ES 3.10 compute shader: each invocation reads one value
// from the input SSBO (binding 0, row-major indexed by global invocation ID)
// and stores it into the corresponding texel of the r32ui image (binding 1).
void CopySSBOToImageTest::initPrograms (SourceCollections& sourceCollections) const
{
	std::ostringstream src;
	src << "#version 310 es\n"
		<< "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ") in;\n"
		<< "layout(binding = 1, r32ui) writeonly uniform highp uimage2D u_dstImg;\n"
		<< "layout(binding = 0) readonly buffer Input {\n"
		<< "    uint values[" << (m_imageSize.x() * m_imageSize.y()) << "];\n"
		<< "} sb_in;\n\n"
		<< "void main (void) {\n"
		<< "    uint stride = gl_NumWorkGroups.x*gl_WorkGroupSize.x;\n"
		<< "    uint value  = sb_in.values[gl_GlobalInvocationID.y*stride + gl_GlobalInvocationID.x];\n"
		<< "    imageStore(u_dstImg, ivec2(gl_GlobalInvocationID.xy), uvec4(value, 0, 0, 0));\n"
		<< "}\n";

	sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
}
891 
createInstance(Context & context) const892 TestInstance* CopySSBOToImageTest::createInstance (Context& context) const
893 {
894 	return new CopySSBOToImageTestInstance(context, m_localSize, m_imageSize);
895 }
896 
CopySSBOToImageTestInstance(Context & context,const tcu::IVec2 & localSize,const tcu::IVec2 & imageSize)897 CopySSBOToImageTestInstance::CopySSBOToImageTestInstance (Context& context, const tcu::IVec2& localSize, const tcu::IVec2& imageSize)
898 	: TestInstance	(context)
899 	, m_localSize	(localSize)
900 	, m_imageSize	(imageSize)
901 {
902 }
903 
// Fills an SSBO with random values, dispatches a compute shader that writes each
// value into the corresponding texel of an R32_UINT image, copies the image back
// to a host-visible buffer, and compares it against the input SSBO.
tcu::TestStatus CopySSBOToImageTestInstance::iterate (void)
{
	ContextCommonData		data	= m_context.getContextCommonData();
	const DeviceInterface&	vkd		= data.vkd;

	// Create an image, a view, and the output buffer
	// TRANSFER_SRC for the read-back copy, STORAGE so the shader can imageStore() into it.
	const VkImageSubresourceRange subresourceRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
	ImageWithBuffer imageWithBuffer(vkd, data.device, data.allocator, vk::makeExtent3D(m_imageSize.x(), m_imageSize.y(), 1),
		VK_FORMAT_R32_UINT, VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_STORAGE_BIT, vk::VK_IMAGE_TYPE_2D,
		subresourceRange);

	const deUint32 imageArea = multiplyComponents(m_imageSize);
	const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * imageArea;	// one 32-bit value per texel

	const BufferWithMemory inputBuffer(vkd, data.device, data.allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);

	// Populate the buffer with test data
	{
		de::Random rnd(0x77238ac2);	// fixed seed for reproducible input
		const Allocation& inputBufferAllocation = inputBuffer.getAllocation();
		deUint32* bufferPtr = static_cast<deUint32*>(inputBufferAllocation.getHostPtr());
		for (deUint32 i = 0; i < imageArea; ++i)
			*bufferPtr++ = rnd.getUint32();

		flushAlloc(vkd, data.device, inputBufferAllocation);
	}

	// Create descriptor set
	// Layout must match the shader: binding 0 = SSBO input, binding 1 = storage image output.
	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
		DescriptorSetLayoutBuilder()
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_SHADER_STAGE_COMPUTE_BIT)
		.build(vkd, data.device));

	const Unique<VkDescriptorPool> descriptorPool(
		DescriptorPoolBuilder()
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE)
		.build(vkd, data.device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vkd, data.device, *descriptorPool, *descriptorSetLayout));

	// Set the bindings

	const VkDescriptorImageInfo imageDescriptorInfo = makeDescriptorImageInfo(DE_NULL, imageWithBuffer.getImageView(), VK_IMAGE_LAYOUT_GENERAL);
	const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*inputBuffer, 0ull, bufferSizeBytes);

	DescriptorSetUpdateBuilder()
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &imageDescriptorInfo)
		.update(vkd, data.device);

	// Perform the computation
	{
		const Unique<VkShaderModule> shaderModule(createShaderModule(vkd, data.device, m_context.getBinaryCollection().get("comp"), 0u));
		const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vkd, data.device, *descriptorSetLayout));
		const Unique<VkPipeline> pipeline(makeComputePipeline(vkd, data.device, *pipelineLayout, *shaderModule));

		// Makes the host-written input data visible to the shader's SSBO reads.
		const VkBufferMemoryBarrier inputBufferPostHostWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *inputBuffer, 0ull, bufferSizeBytes);

		// Transitions the freshly-created image to GENERAL before the shader writes it.
		const VkImageMemoryBarrier imageLayoutBarrier = makeImageMemoryBarrier(
			0u, VK_ACCESS_SHADER_WRITE_BIT,
			VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL,
			imageWithBuffer.getImage(), subresourceRange);

		// One work group per localSize-sized tile; the ctor asserts even divisibility.
		const tcu::IVec2 workSize = m_imageSize / m_localSize;

		// Prepare the command buffer

		const Unique<VkCommandPool> cmdPool(makeCommandPool(vkd, data.device, data.qfIndex));
		const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vkd, data.device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

		// Start recording commands

		beginCommandBuffer(vkd, *cmdBuffer);

		vkd.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
		vkd.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);

		// Single barrier call covers both the host-write visibility and the layout transition.
		vkd.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &inputBufferPostHostWriteBarrier, 1, &imageLayoutBarrier);
		vkd.cmdDispatch(*cmdBuffer, workSize.x(), workSize.y(), 1u);

		// Read the image back into the helper's buffer; the helper is given the
		// shader-write access and GENERAL layout to synchronize against the dispatch.
		copyImageToBuffer(vkd, *cmdBuffer, imageWithBuffer.getImage(), imageWithBuffer.getBuffer(), m_imageSize, VK_ACCESS_SHADER_WRITE_BIT, VK_IMAGE_LAYOUT_GENERAL);

		endCommandBuffer(vkd, *cmdBuffer);

		// Wait for completion

		submitCommandsAndWait(vkd, data.device, data.queue, *cmdBuffer);
	}

	// Validate the results

	const Allocation& outputBufferAllocation = imageWithBuffer.getBufferAllocation();
	invalidateAlloc(vkd, data.device, outputBufferAllocation);

	// The shader stores values verbatim, so the image read-back must equal the input SSBO.
	const deUint32* bufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
	const deUint32* refBufferPtr = static_cast<deUint32*>(inputBuffer.getAllocation().getHostPtr());

	for (deUint32 ndx = 0; ndx < imageArea; ++ndx)
	{
		const deUint32 res = *(bufferPtr + ndx);
		const deUint32 ref = *(refBufferPtr + ndx);

		if (res != ref)
		{
			std::ostringstream msg;
			msg << "Comparison failed for pixel " << ndx;
			return tcu::TestStatus::fail(msg.str());
		}
	}
	return tcu::TestStatus::pass("Compute succeeded");
}
1017 
// Test case: a compute shader reads values from an input buffer (UBO or SSBO,
// chosen per variant), bitwise-inverts them, and writes the results to an output
// SSBO. Instances are created through the two named factory functions; the
// constructor is private to force the buffer-type choice through them.
class BufferToBufferInvertTest : public vkt::TestCase
{
public:
	void								initPrograms				(SourceCollections&	sourceCollections) const;
	TestInstance*						createInstance				(Context&			context) const;

	// Variant reading its input from a uniform buffer.
	static BufferToBufferInvertTest*	UBOToSSBOInvertCase			(tcu::TestContext&	testCtx,
																	 const std::string& name,
																	 const std::string& description,
																	 const deUint32		numValues,
																	 const tcu::IVec3&	localSize,
																	 const tcu::IVec3&	workSize);

	// Variant reading its input from a storage buffer.
	static BufferToBufferInvertTest*	CopyInvertSSBOCase			(tcu::TestContext&	testCtx,
																	 const std::string& name,
																	 const std::string& description,
																	 const deUint32		numValues,
																	 const tcu::IVec3&	localSize,
																	 const tcu::IVec3&	workSize);

private:
										BufferToBufferInvertTest	(tcu::TestContext&	testCtx,
																	 const std::string& name,
																	 const std::string& description,
																	 const deUint32		numValues,
																	 const tcu::IVec3&	localSize,
																	 const tcu::IVec3&	workSize,
																	 const BufferType	bufferType);

	const BufferType					m_bufferType;	// BUFFER_TYPE_UNIFORM or BUFFER_TYPE_SSBO
	const deUint32						m_numValues;	// number of uint values processed
	const tcu::IVec3					m_localSize;	// compute work group size
	const tcu::IVec3					m_workSize;		// dispatch size in work groups
};
1052 
// Per-execution instance for BufferToBufferInvertTest; iterate() runs the
// dispatch and verifies that every output value is the bitwise NOT of its input.
class BufferToBufferInvertTestInstance : public vkt::TestInstance
{
public:
									BufferToBufferInvertTestInstance	(Context&			context,
																		 const deUint32		numValues,
																		 const tcu::IVec3&	localSize,
																		 const tcu::IVec3&	workSize,
																		 const BufferType	bufferType);

	tcu::TestStatus					iterate								(void);

private:
	const BufferType				m_bufferType;	// BUFFER_TYPE_UNIFORM or BUFFER_TYPE_SSBO
	const deUint32					m_numValues;	// number of uint values processed
	const tcu::IVec3				m_localSize;	// compute work group size
	const tcu::IVec3				m_workSize;		// dispatch size in work groups
};
1070 
BufferToBufferInvertTest(tcu::TestContext & testCtx,const std::string & name,const std::string & description,const deUint32 numValues,const tcu::IVec3 & localSize,const tcu::IVec3 & workSize,const BufferType bufferType)1071 BufferToBufferInvertTest::BufferToBufferInvertTest (tcu::TestContext&	testCtx,
1072 													const std::string&	name,
1073 													const std::string&	description,
1074 													const deUint32		numValues,
1075 													const tcu::IVec3&	localSize,
1076 													const tcu::IVec3&	workSize,
1077 													const BufferType	bufferType)
1078 	: TestCase		(testCtx, name, description)
1079 	, m_bufferType	(bufferType)
1080 	, m_numValues	(numValues)
1081 	, m_localSize	(localSize)
1082 	, m_workSize	(workSize)
1083 {
1084 	DE_ASSERT(m_numValues % (multiplyComponents(m_workSize) * multiplyComponents(m_localSize)) == 0);
1085 	DE_ASSERT(m_bufferType == BUFFER_TYPE_UNIFORM || m_bufferType == BUFFER_TYPE_SSBO);
1086 }
1087 
UBOToSSBOInvertCase(tcu::TestContext & testCtx,const std::string & name,const std::string & description,const deUint32 numValues,const tcu::IVec3 & localSize,const tcu::IVec3 & workSize)1088 BufferToBufferInvertTest* BufferToBufferInvertTest::UBOToSSBOInvertCase (tcu::TestContext&	testCtx,
1089 																		 const std::string&	name,
1090 																		 const std::string&	description,
1091 																		 const deUint32		numValues,
1092 																		 const tcu::IVec3&	localSize,
1093 																		 const tcu::IVec3&	workSize)
1094 {
1095 	return new BufferToBufferInvertTest(testCtx, name, description, numValues, localSize, workSize, BUFFER_TYPE_UNIFORM);
1096 }
1097 
CopyInvertSSBOCase(tcu::TestContext & testCtx,const std::string & name,const std::string & description,const deUint32 numValues,const tcu::IVec3 & localSize,const tcu::IVec3 & workSize)1098 BufferToBufferInvertTest* BufferToBufferInvertTest::CopyInvertSSBOCase (tcu::TestContext&	testCtx,
1099 																		const std::string&	name,
1100 																		const std::string&	description,
1101 																		const deUint32		numValues,
1102 																		const tcu::IVec3&	localSize,
1103 																		const tcu::IVec3&	workSize)
1104 {
1105 	return new BufferToBufferInvertTest(testCtx, name, description, numValues, localSize, workSize, BUFFER_TYPE_SSBO);
1106 }
1107 
// Generates the GLSL ES 3.10 compute shader. Both variants split the value array
// evenly across all invocations; each invocation bitwise-inverts (~) its slice
// from the input block into the output SSBO. The only difference between the
// variants is whether the input block is a uniform block or an SSBO.
// Note: std140 on the buffer blocks gives the uint arrays a 16-byte stride,
// which the host side matches by using tcu::UVec4 elements.
void BufferToBufferInvertTest::initPrograms (SourceCollections& sourceCollections) const
{
	std::ostringstream src;
	if (m_bufferType == BUFFER_TYPE_UNIFORM)
	{
		src << "#version 310 es\n"
			<< "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
			<< "layout(binding = 0) readonly uniform Input {\n"
			<< "    uint values[" << m_numValues << "];\n"
			<< "} ub_in;\n"
			<< "layout(binding = 1, std140) writeonly buffer Output {\n"
			<< "    uint values[" << m_numValues << "];\n"
			<< "} sb_out;\n"
			<< "void main (void) {\n"
			<< "    uvec3 size           = gl_NumWorkGroups * gl_WorkGroupSize;\n"
			<< "    uint numValuesPerInv = uint(ub_in.values.length()) / (size.x*size.y*size.z);\n"
			<< "    uint groupNdx        = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
			<< "    uint offset          = numValuesPerInv*groupNdx;\n"
			<< "\n"
			<< "    for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
			<< "        sb_out.values[offset + ndx] = ~ub_in.values[offset + ndx];\n"
			<< "}\n";
	}
	else if (m_bufferType == BUFFER_TYPE_SSBO)
	{
		src << "#version 310 es\n"
			<< "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
			<< "layout(binding = 0, std140) readonly buffer Input {\n"
			<< "    uint values[" << m_numValues << "];\n"
			<< "} sb_in;\n"
			<< "layout (binding = 1, std140) writeonly buffer Output {\n"
			<< "    uint values[" << m_numValues << "];\n"
			<< "} sb_out;\n"
			<< "void main (void) {\n"
			<< "    uvec3 size           = gl_NumWorkGroups * gl_WorkGroupSize;\n"
			<< "    uint numValuesPerInv = uint(sb_in.values.length()) / (size.x*size.y*size.z);\n"
			<< "    uint groupNdx        = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
			<< "    uint offset          = numValuesPerInv*groupNdx;\n"
			<< "\n"
			<< "    for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
			<< "        sb_out.values[offset + ndx] = ~sb_in.values[offset + ndx];\n"
			<< "}\n";
	}

	sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
}
1154 
createInstance(Context & context) const1155 TestInstance* BufferToBufferInvertTest::createInstance (Context& context) const
1156 {
1157 	return new BufferToBufferInvertTestInstance(context, m_numValues, m_localSize, m_workSize, m_bufferType);
1158 }
1159 
BufferToBufferInvertTestInstance(Context & context,const deUint32 numValues,const tcu::IVec3 & localSize,const tcu::IVec3 & workSize,const BufferType bufferType)1160 BufferToBufferInvertTestInstance::BufferToBufferInvertTestInstance (Context&			context,
1161 																	const deUint32		numValues,
1162 																	const tcu::IVec3&	localSize,
1163 																	const tcu::IVec3&	workSize,
1164 																	const BufferType	bufferType)
1165 	: TestInstance	(context)
1166 	, m_bufferType	(bufferType)
1167 	, m_numValues	(numValues)
1168 	, m_localSize	(localSize)
1169 	, m_workSize	(workSize)
1170 {
1171 }
1172 
// Fills an input buffer (UBO or SSBO depending on the variant) with random
// values, dispatches a compute shader that writes the bitwise NOT of each value
// into an output SSBO, and verifies the result on the host.
tcu::TestStatus BufferToBufferInvertTestInstance::iterate (void)
{
	const DeviceInterface&	vk					= m_context.getDeviceInterface();
	const VkDevice			device				= m_context.getDevice();
	const VkQueue			queue				= m_context.getUniversalQueue();
	const deUint32			queueFamilyIndex	= m_context.getUniversalQueueFamilyIndex();
	Allocator&				allocator			= m_context.getDefaultAllocator();

	// Customize the test based on buffer type

	const VkBufferUsageFlags inputBufferUsageFlags		= (m_bufferType == BUFFER_TYPE_UNIFORM ? VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT : VK_BUFFER_USAGE_STORAGE_BUFFER_BIT);
	const VkDescriptorType inputBufferDescriptorType	= (m_bufferType == BUFFER_TYPE_UNIFORM ? VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER : VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
	const deUint32 randomSeed							= (m_bufferType == BUFFER_TYPE_UNIFORM ? 0x111223f : 0x124fef);

	// Create an input buffer
	// The shader declares the uint arrays with std140 layout (16-byte array
	// stride), so the host uses one tcu::UVec4 per value and only its .x() lane.

	const VkDeviceSize bufferSizeBytes = sizeof(tcu::UVec4) * m_numValues;
	const BufferWithMemory inputBuffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, inputBufferUsageFlags), MemoryRequirement::HostVisible);

	// Fill the input buffer with data
	{
		de::Random rnd(randomSeed);	// fixed per-variant seed for reproducible input
		const Allocation& inputBufferAllocation = inputBuffer.getAllocation();
		tcu::UVec4* bufferPtr = static_cast<tcu::UVec4*>(inputBufferAllocation.getHostPtr());
		for (deUint32 i = 0; i < m_numValues; ++i)
			bufferPtr[i].x() = rnd.getUint32();

		flushAlloc(vk, device, inputBufferAllocation);
	}

	// Create an output buffer

	const BufferWithMemory outputBuffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);

	// Create descriptor set
	// Binding 0 = input (type depends on variant), binding 1 = output SSBO.

	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
		DescriptorSetLayoutBuilder()
		.addSingleBinding(inputBufferDescriptorType, VK_SHADER_STAGE_COMPUTE_BIT)
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.build(vk, device));

	const Unique<VkDescriptorPool> descriptorPool(
		DescriptorPoolBuilder()
		.addType(inputBufferDescriptorType)
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

	const VkDescriptorBufferInfo inputBufferDescriptorInfo = makeDescriptorBufferInfo(*inputBuffer, 0ull, bufferSizeBytes);
	const VkDescriptorBufferInfo outputBufferDescriptorInfo = makeDescriptorBufferInfo(*outputBuffer, 0ull, bufferSizeBytes);
	DescriptorSetUpdateBuilder()
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), inputBufferDescriptorType, &inputBufferDescriptorInfo)
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &outputBufferDescriptorInfo)
		.update(vk, device);

	// Perform the computation

	const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
	const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
	const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));

	// Makes the host-written input visible to the shader's reads.
	const VkBufferMemoryBarrier hostWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *inputBuffer, 0ull, bufferSizeBytes);

	// Makes the shader's output writes visible to the host read-back below.
	const VkBufferMemoryBarrier shaderWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *outputBuffer, 0ull, bufferSizeBytes);

	const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
	const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

	// Start recording commands

	beginCommandBuffer(vk, *cmdBuffer);

	vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
	vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);

	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
	vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &shaderWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

	endCommandBuffer(vk, *cmdBuffer);

	// Wait for completion

	submitCommandsAndWait(vk, device, queue, *cmdBuffer);

	// Validate the results

	const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
	invalidateAlloc(vk, device, outputBufferAllocation);

	// Each output .x() lane must be the bitwise NOT of the corresponding input lane.
	const tcu::UVec4* bufferPtr = static_cast<tcu::UVec4*>(outputBufferAllocation.getHostPtr());
	const tcu::UVec4* refBufferPtr = static_cast<tcu::UVec4*>(inputBuffer.getAllocation().getHostPtr());

	for (deUint32 ndx = 0; ndx < m_numValues; ++ndx)
	{
		const deUint32 res = bufferPtr[ndx].x();
		const deUint32 ref = ~refBufferPtr[ndx].x();

		if (res != ref)
		{
			std::ostringstream msg;
			msg << "Comparison failed for Output.values[" << ndx << "]";
			return tcu::TestStatus::fail(msg.str());
		}
	}
	return tcu::TestStatus::pass("Compute succeeded");
}
1282 
// Test case: a compute shader bitwise-inverts the contents of a single SSBO in
// place (same buffer bound for read and write). The "sized" flag controls
// whether the shader declares the value array with an explicit size or as a
// runtime-sized (unsized) array.
class InvertSSBOInPlaceTest : public vkt::TestCase
{
public:
						InvertSSBOInPlaceTest	(tcu::TestContext&	testCtx,
												 const std::string&	name,
												 const std::string&	description,
												 const deUint32		numValues,
												 const bool			sized,
												 const tcu::IVec3&	localSize,
												 const tcu::IVec3&	workSize);


	void				initPrograms			(SourceCollections& sourceCollections) const;
	TestInstance*		createInstance			(Context&			context) const;

private:
	const deUint32		m_numValues;	// number of uint values processed
	const bool			m_sized;		// true = explicit array size in the shader
	const tcu::IVec3	m_localSize;	// compute work group size
	const tcu::IVec3	m_workSize;		// dispatch size in work groups
};
1304 
// Per-execution instance for InvertSSBOInPlaceTest; iterate() runs the dispatch
// and verifies the in-place inversion. Note the "sized" flag only affects shader
// generation, so it is not carried into the instance.
class InvertSSBOInPlaceTestInstance : public vkt::TestInstance
{
public:
									InvertSSBOInPlaceTestInstance	(Context&			context,
																	 const deUint32		numValues,
																	 const tcu::IVec3&	localSize,
																	 const tcu::IVec3&	workSize);

	tcu::TestStatus					iterate							(void);

private:
	const deUint32					m_numValues;	// number of uint values processed
	const tcu::IVec3				m_localSize;	// compute work group size
	const tcu::IVec3				m_workSize;		// dispatch size in work groups
};
1320 
InvertSSBOInPlaceTest(tcu::TestContext & testCtx,const std::string & name,const std::string & description,const deUint32 numValues,const bool sized,const tcu::IVec3 & localSize,const tcu::IVec3 & workSize)1321 InvertSSBOInPlaceTest::InvertSSBOInPlaceTest (tcu::TestContext&		testCtx,
1322 											  const std::string&	name,
1323 											  const std::string&	description,
1324 											  const deUint32		numValues,
1325 											  const bool			sized,
1326 											  const tcu::IVec3&		localSize,
1327 											  const tcu::IVec3&		workSize)
1328 	: TestCase		(testCtx, name, description)
1329 	, m_numValues	(numValues)
1330 	, m_sized		(sized)
1331 	, m_localSize	(localSize)
1332 	, m_workSize	(workSize)
1333 {
1334 	DE_ASSERT(m_numValues % (multiplyComponents(m_workSize) * multiplyComponents(m_localSize)) == 0);
1335 }
1336 
// Generates the GLSL ES 3.10 compute shader: the value array is split evenly
// across all invocations and each invocation bitwise-inverts (~) its slice of
// the single read-write SSBO in place. When m_sized is false the array is
// declared unsized, so .length() is derived from the bound buffer range at
// runtime instead of the compile-time size.
void InvertSSBOInPlaceTest::initPrograms (SourceCollections& sourceCollections) const
{
	std::ostringstream src;
	src << "#version 310 es\n"
		<< "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
		<< "layout(binding = 0) buffer InOut {\n"
		<< "    uint values[" << (m_sized ? de::toString(m_numValues) : "") << "];\n"
		<< "} sb_inout;\n"
		<< "void main (void) {\n"
		<< "    uvec3 size           = gl_NumWorkGroups * gl_WorkGroupSize;\n"
		<< "    uint numValuesPerInv = uint(sb_inout.values.length()) / (size.x*size.y*size.z);\n"
		<< "    uint groupNdx        = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
		<< "    uint offset          = numValuesPerInv*groupNdx;\n"
		<< "\n"
		<< "    for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
		<< "        sb_inout.values[offset + ndx] = ~sb_inout.values[offset + ndx];\n"
		<< "}\n";

	sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
}
1357 
createInstance(Context & context) const1358 TestInstance* InvertSSBOInPlaceTest::createInstance (Context& context) const
1359 {
1360 	return new InvertSSBOInPlaceTestInstance(context, m_numValues, m_localSize, m_workSize);
1361 }
1362 
InvertSSBOInPlaceTestInstance(Context & context,const deUint32 numValues,const tcu::IVec3 & localSize,const tcu::IVec3 & workSize)1363 InvertSSBOInPlaceTestInstance::InvertSSBOInPlaceTestInstance (Context&			context,
1364 															  const deUint32	numValues,
1365 															  const tcu::IVec3&	localSize,
1366 															  const tcu::IVec3&	workSize)
1367 	: TestInstance	(context)
1368 	, m_numValues	(numValues)
1369 	, m_localSize	(localSize)
1370 	, m_workSize	(workSize)
1371 {
1372 }
1373 
// Runs the invert shader on a randomly-filled host-visible SSBO and checks
// that every element equals the bitwise inverse of the original data.
tcu::TestStatus InvertSSBOInPlaceTestInstance::iterate (void)
{
	const DeviceInterface&	vk					= m_context.getDeviceInterface();
	const VkDevice			device				= m_context.getDevice();
	const VkQueue			queue				= m_context.getUniversalQueue();
	const deUint32			queueFamilyIndex	= m_context.getUniversalQueueFamilyIndex();
	Allocator&				allocator			= m_context.getDefaultAllocator();

	// Create an input/output buffer

	const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * m_numValues;
	const BufferWithMemory buffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);

	// Fill the buffer with data

	typedef std::vector<deUint32> data_vector_t;
	data_vector_t inputData(m_numValues);

	{
		de::Random rnd(0x82ce7f);	// fixed seed => reproducible input data
		const Allocation& bufferAllocation = buffer.getAllocation();
		deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
		// Keep a host-side copy (inputData) for verification after the dispatch.
		for (deUint32 i = 0; i < m_numValues; ++i)
			inputData[i] = *bufferPtr++ = rnd.getUint32();

		flushAlloc(vk, device, bufferAllocation);	// make host writes visible to the device
	}

	// Create descriptor set

	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
		DescriptorSetLayoutBuilder()
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.build(vk, device));

	const Unique<VkDescriptorPool> descriptorPool(
		DescriptorPoolBuilder()
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

	const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*buffer, 0ull, bufferSizeBytes);
	DescriptorSetUpdateBuilder()
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
		.update(vk, device);

	// Perform the computation

	const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
	const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
	const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));

	// Host write -> shader read dependency for the input data
	const VkBufferMemoryBarrier hostWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *buffer, 0ull, bufferSizeBytes);

	// Shader write -> host read dependency for the results
	const VkBufferMemoryBarrier shaderWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer, 0ull, bufferSizeBytes);

	const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
	const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

	// Start recording commands

	beginCommandBuffer(vk, *cmdBuffer);

	vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
	vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);

	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
	vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &shaderWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

	endCommandBuffer(vk, *cmdBuffer);

	// Wait for completion

	submitCommandsAndWait(vk, device, queue, *cmdBuffer);

	// Validate the results

	const Allocation& bufferAllocation = buffer.getAllocation();
	invalidateAlloc(vk, device, bufferAllocation);	// make device writes visible to the host

	const deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());

	for (deUint32 ndx = 0; ndx < m_numValues; ++ndx)
	{
		const deUint32 res = bufferPtr[ndx];
		const deUint32 ref = ~inputData[ndx];

		if (res != ref)
		{
			std::ostringstream msg;
			msg << "Comparison failed for InOut.values[" << ndx << "]";
			return tcu::TestStatus::fail(msg.str());
		}
	}
	return tcu::TestStatus::pass("Compute succeeded");
}
1472 
// Test case: a single compute shader writes distinct value patterns into
// two separate SSBOs (bindings 0 and 1).
class WriteToMultipleSSBOTest : public vkt::TestCase
{
public:
						WriteToMultipleSSBOTest	(tcu::TestContext&	testCtx,
												 const std::string&	name,
												 const std::string&	description,
												 const deUint32		numValues,
												 const bool			sized,
												 const tcu::IVec3&	localSize,
												 const tcu::IVec3&	workSize);

	// Builds the GLSL compute shader ("comp") with two writeonly buffers.
	void				initPrograms			(SourceCollections& sourceCollections) const;
	TestInstance*		createInstance			(Context&			context) const;

private:
	const deUint32		m_numValues;	// number of uints written into each buffer
	const bool			m_sized;		// explicitly sized vs. runtime-sized shader arrays
	const tcu::IVec3	m_localSize;	// workgroup (local) size
	const tcu::IVec3	m_workSize;		// number of workgroups dispatched
};
1493 
// Instance side of WriteToMultipleSSBOTest: records the dispatch and
// verifies both output buffers on the host.
class WriteToMultipleSSBOTestInstance : public vkt::TestInstance
{
public:
									WriteToMultipleSSBOTestInstance	(Context&			context,
																	 const deUint32		numValues,
																	 const tcu::IVec3&	localSize,
																	 const tcu::IVec3&	workSize);

	tcu::TestStatus					iterate							(void);

private:
	const deUint32					m_numValues;	// number of uints per output buffer
	const tcu::IVec3				m_localSize;	// workgroup (local) size
	const tcu::IVec3				m_workSize;		// number of workgroups dispatched
};
1509 
// Stores the test parameters. The value count must divide evenly among all
// invocations so each one writes the same number of elements (see shader).
WriteToMultipleSSBOTest::WriteToMultipleSSBOTest (tcu::TestContext&		testCtx,
												  const std::string&	name,
												  const std::string&	description,
												  const deUint32		numValues,
												  const bool			sized,
												  const tcu::IVec3&		localSize,
												  const tcu::IVec3&		workSize)
	: TestCase		(testCtx, name, description)
	, m_numValues	(numValues)
	, m_sized		(sized)
	, m_localSize	(localSize)
	, m_workSize	(workSize)
{
	// Total invocation count = workgroups * local size; it must evenly divide m_numValues.
	DE_ASSERT(m_numValues % (multiplyComponents(m_workSize) * multiplyComponents(m_localSize)) == 0);
}
1525 
// Generates the compute shader: buffer Out0 (binding 0) receives ascending
// indices, buffer Out1 (binding 1) receives length - index. Arrays are
// explicitly sized with m_numValues or runtime-sized, depending on m_sized.
void WriteToMultipleSSBOTest::initPrograms (SourceCollections& sourceCollections) const
{
	std::ostringstream src;
	src << "#version 310 es\n"
		<< "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
		<< "layout(binding = 0) writeonly buffer Out0 {\n"
		<< "    uint values[" << (m_sized ? de::toString(m_numValues) : "") << "];\n"
		<< "} sb_out0;\n"
		<< "layout(binding = 1) writeonly buffer Out1 {\n"
		<< "    uint values[" << (m_sized ? de::toString(m_numValues) : "") << "];\n"
		<< "} sb_out1;\n"
		<< "void main (void) {\n"
		<< "    uvec3 size      = gl_NumWorkGroups * gl_WorkGroupSize;\n"
		<< "    uint groupNdx   = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
		<< "\n"
		<< "    {\n"
		<< "        uint numValuesPerInv = uint(sb_out0.values.length()) / (size.x*size.y*size.z);\n"
		<< "        uint offset          = numValuesPerInv*groupNdx;\n"
		<< "\n"
		<< "        for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
		<< "            sb_out0.values[offset + ndx] = offset + ndx;\n"
		<< "    }\n"
		<< "    {\n"
		<< "        uint numValuesPerInv = uint(sb_out1.values.length()) / (size.x*size.y*size.z);\n"
		<< "        uint offset          = numValuesPerInv*groupNdx;\n"
		<< "\n"
		<< "        for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
		<< "            sb_out1.values[offset + ndx] = uint(sb_out1.values.length()) - offset - ndx;\n"
		<< "    }\n"
		<< "}\n";

	sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
}
1559 
createInstance(Context & context) const1560 TestInstance* WriteToMultipleSSBOTest::createInstance (Context& context) const
1561 {
1562 	return new WriteToMultipleSSBOTestInstance(context, m_numValues, m_localSize, m_workSize);
1563 }
1564 
// Stores the test parameters; all per-iteration work happens in iterate().
WriteToMultipleSSBOTestInstance::WriteToMultipleSSBOTestInstance (Context&			context,
																  const deUint32	numValues,
																  const tcu::IVec3&	localSize,
																  const tcu::IVec3&	workSize)
	: TestInstance	(context)
	, m_numValues	(numValues)
	, m_localSize	(localSize)
	, m_workSize	(workSize)
{
}
1575 
// Dispatches the two-output shader and verifies buffer0 holds ascending
// indices and buffer1 holds m_numValues - index for every element.
tcu::TestStatus WriteToMultipleSSBOTestInstance::iterate (void)
{
	const DeviceInterface&	vk					= m_context.getDeviceInterface();
	const VkDevice			device				= m_context.getDevice();
	const VkQueue			queue				= m_context.getUniversalQueue();
	const deUint32			queueFamilyIndex	= m_context.getUniversalQueueFamilyIndex();
	Allocator&				allocator			= m_context.getDefaultAllocator();

	// Create two output buffers

	const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * m_numValues;
	const BufferWithMemory buffer0(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
	const BufferWithMemory buffer1(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);

	// Create descriptor set

	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
		DescriptorSetLayoutBuilder()
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.build(vk, device));

	const Unique<VkDescriptorPool> descriptorPool(
		DescriptorPoolBuilder()
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 2u)
		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

	const VkDescriptorBufferInfo buffer0DescriptorInfo = makeDescriptorBufferInfo(*buffer0, 0ull, bufferSizeBytes);
	const VkDescriptorBufferInfo buffer1DescriptorInfo = makeDescriptorBufferInfo(*buffer1, 0ull, bufferSizeBytes);
	DescriptorSetUpdateBuilder()
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &buffer0DescriptorInfo)
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &buffer1DescriptorInfo)
		.update(vk, device);

	// Perform the computation

	const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
	const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
	const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));

	// Shader write -> host read dependency for both output buffers.
	// No host-write barrier is needed: the shader only writes these buffers.
	const VkBufferMemoryBarrier shaderWriteBarriers[] =
	{
		makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer0, 0ull, bufferSizeBytes),
		makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer1, 0ull, bufferSizeBytes)
	};

	const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
	const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

	// Start recording commands

	beginCommandBuffer(vk, *cmdBuffer);

	vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
	vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);

	vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, DE_LENGTH_OF_ARRAY(shaderWriteBarriers), shaderWriteBarriers, 0, (const VkImageMemoryBarrier*)DE_NULL);

	endCommandBuffer(vk, *cmdBuffer);

	// Wait for completion

	submitCommandsAndWait(vk, device, queue, *cmdBuffer);

	// Validate the results
	{
		const Allocation& buffer0Allocation = buffer0.getAllocation();
		invalidateAlloc(vk, device, buffer0Allocation);	// make device writes visible to the host
		const deUint32* buffer0Ptr = static_cast<deUint32*>(buffer0Allocation.getHostPtr());

		// Out0 was written with ascending indices (see initPrograms).
		for (deUint32 ndx = 0; ndx < m_numValues; ++ndx)
		{
			const deUint32 res = buffer0Ptr[ndx];
			const deUint32 ref = ndx;

			if (res != ref)
			{
				std::ostringstream msg;
				msg << "Comparison failed for Out0.values[" << ndx << "] res=" << res << " ref=" << ref;
				return tcu::TestStatus::fail(msg.str());
			}
		}
	}
	{
		const Allocation& buffer1Allocation = buffer1.getAllocation();
		invalidateAlloc(vk, device, buffer1Allocation);	// make device writes visible to the host
		const deUint32* buffer1Ptr = static_cast<deUint32*>(buffer1Allocation.getHostPtr());

		// Out1 was written with length - index (see initPrograms).
		for (deUint32 ndx = 0; ndx < m_numValues; ++ndx)
		{
			const deUint32 res = buffer1Ptr[ndx];
			const deUint32 ref = m_numValues - ndx;

			if (res != ref)
			{
				std::ostringstream msg;
				msg << "Comparison failed for Out1.values[" << ndx << "] res=" << res << " ref=" << ref;
				return tcu::TestStatus::fail(msg.str());
			}
		}
	}
	return tcu::TestStatus::pass("Compute succeeded");
}
1682 
// Test case: two chained compute pipelines with an SSBO barrier between
// them. The first shader fills a work buffer; the second reduces it with
// atomicAdd into a single sum.
class SSBOBarrierTest : public vkt::TestCase
{
public:
						SSBOBarrierTest		(tcu::TestContext&	testCtx,
											 const std::string&	name,
											 const std::string&	description,
											 const tcu::IVec3&	workSize);

	// Builds the two GLSL compute shaders ("comp0" producer, "comp1" consumer).
	void				initPrograms		(SourceCollections& sourceCollections) const;
	TestInstance*		createInstance		(Context&			context) const;

private:
	const tcu::IVec3	m_workSize;		// number of workgroups dispatched for each pass
};
1697 
// Instance side of SSBOBarrierTest: records both dispatches with the
// intermediate barrier and validates the accumulated sum.
class SSBOBarrierTestInstance : public vkt::TestInstance
{
public:
									SSBOBarrierTestInstance		(Context&			context,
																 const tcu::IVec3&	workSize);

	tcu::TestStatus					iterate						(void);

private:
	const tcu::IVec3				m_workSize;		// number of workgroups dispatched for each pass
};
1709 
// Stores the dispatch size; shader generation happens in initPrograms().
SSBOBarrierTest::SSBOBarrierTest (tcu::TestContext&		testCtx,
								  const std::string&	name,
								  const std::string&	description,
								  const tcu::IVec3&		workSize)
	: TestCase		(testCtx, name, description)
	, m_workSize	(workSize)
{
}
1718 
// Generates the two shaders:
//  comp0 (producer): each workgroup writes u_baseVal + its linear workgroup
//                    index into the work buffer at binding 1.
//  comp1 (consumer): each workgroup reads its element from binding 1 and
//                    atomically adds it into the sum at binding 0.
void SSBOBarrierTest::initPrograms (SourceCollections& sourceCollections) const
{
	sourceCollections.glslSources.add("comp0") << glu::ComputeSource(
		"#version 310 es\n"
		"layout (local_size_x = 1) in;\n"
		"layout(binding = 2) readonly uniform Constants {\n"
		"    uint u_baseVal;\n"
		"};\n"
		"layout(binding = 1) writeonly buffer Output {\n"
		"    uint values[];\n"
		"};\n"
		"void main (void) {\n"
		"    uint offset = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
		"    values[offset] = u_baseVal + offset;\n"
		"}\n");

	sourceCollections.glslSources.add("comp1") << glu::ComputeSource(
		"#version 310 es\n"
		"layout (local_size_x = 1) in;\n"
		"layout(binding = 1) readonly buffer Input {\n"
		"    uint values[];\n"
		"};\n"
		"layout(binding = 0) coherent buffer Output {\n"
		"    uint sum;\n"
		"};\n"
		"void main (void) {\n"
		"    uint offset = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
		"    uint value  = values[offset];\n"
		"    atomicAdd(sum, value);\n"
		"}\n");
}
1750 
createInstance(Context & context) const1751 TestInstance* SSBOBarrierTest::createInstance (Context& context) const
1752 {
1753 	return new SSBOBarrierTestInstance(context, m_workSize);
1754 }
1755 
// Stores the dispatch size; all per-iteration work happens in iterate().
SSBOBarrierTestInstance::SSBOBarrierTestInstance (Context& context, const tcu::IVec3& workSize)
	: TestInstance	(context)
	, m_workSize	(workSize)
{
}
1761 
// Runs the producer and consumer pipelines back-to-back with a
// compute->compute buffer barrier between them, then checks that the
// atomic sum equals sum(baseValue + ndx) over all workgroups.
tcu::TestStatus SSBOBarrierTestInstance::iterate (void)
{
	const DeviceInterface&	vk					= m_context.getDeviceInterface();
	const VkDevice			device				= m_context.getDevice();
	const VkQueue			queue				= m_context.getUniversalQueue();
	const deUint32			queueFamilyIndex	= m_context.getUniversalQueueFamilyIndex();
	Allocator&				allocator			= m_context.getDefaultAllocator();

	// Create a work buffer used by both shaders

	const int workGroupCount = multiplyComponents(m_workSize);
	const VkDeviceSize workBufferSizeBytes = sizeof(deUint32) * workGroupCount;
	// Device-only: the host never reads or writes this buffer.
	const BufferWithMemory workBuffer(vk, device, allocator, makeBufferCreateInfo(workBufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::Any);

	// Create an output buffer

	const VkDeviceSize outputBufferSizeBytes = sizeof(deUint32);
	const BufferWithMemory outputBuffer(vk, device, allocator, makeBufferCreateInfo(outputBufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);

	// Initialize atomic counter value to zero
	{
		const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
		deUint32* outputBufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
		*outputBufferPtr = 0;
		flushAlloc(vk, device, outputBufferAllocation);	// make host write visible to the device
	}

	// Create a uniform buffer (to pass uniform constants)

	const VkDeviceSize uniformBufferSizeBytes = sizeof(deUint32);
	const BufferWithMemory uniformBuffer(vk, device, allocator, makeBufferCreateInfo(uniformBufferSizeBytes, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT), MemoryRequirement::HostVisible);

	// Set the constants in the uniform buffer

	const deUint32	baseValue = 127;	// u_baseVal in the producer shader
	{
		const Allocation& uniformBufferAllocation = uniformBuffer.getAllocation();
		deUint32* uniformBufferPtr = static_cast<deUint32*>(uniformBufferAllocation.getHostPtr());
		uniformBufferPtr[0] = baseValue;

		flushAlloc(vk, device, uniformBufferAllocation);	// make host write visible to the device
	}

	// Create descriptor set

	// Bindings match both shaders: 0 = sum SSBO, 1 = work SSBO, 2 = constants UBO.
	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
		DescriptorSetLayoutBuilder()
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.addSingleBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.build(vk, device));

	const Unique<VkDescriptorPool> descriptorPool(
		DescriptorPoolBuilder()
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 2u)
		.addType(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)
		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

	const VkDescriptorBufferInfo workBufferDescriptorInfo = makeDescriptorBufferInfo(*workBuffer, 0ull, workBufferSizeBytes);
	const VkDescriptorBufferInfo outputBufferDescriptorInfo = makeDescriptorBufferInfo(*outputBuffer, 0ull, outputBufferSizeBytes);
	const VkDescriptorBufferInfo uniformBufferDescriptorInfo = makeDescriptorBufferInfo(*uniformBuffer, 0ull, uniformBufferSizeBytes);
	DescriptorSetUpdateBuilder()
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &outputBufferDescriptorInfo)
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &workBufferDescriptorInfo)
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(2u), VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, &uniformBufferDescriptorInfo)
		.update(vk, device);

	// Perform the computation

	const Unique<VkShaderModule> shaderModule0(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp0"), 0));
	const Unique<VkShaderModule> shaderModule1(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp1"), 0));

	// Both pipelines share the same layout, so the descriptor set can stay bound.
	const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
	const Unique<VkPipeline> pipeline0(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule0));
	const Unique<VkPipeline> pipeline1(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule1));

	// Host write -> uniform read dependency for the constants
	const VkBufferMemoryBarrier writeUniformConstantsBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_UNIFORM_READ_BIT, *uniformBuffer, 0ull, uniformBufferSizeBytes);

	// First dispatch's writes -> second dispatch's reads of the work buffer
	const VkBufferMemoryBarrier betweenShadersBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *workBuffer, 0ull, workBufferSizeBytes);

	// Second dispatch's writes -> host read of the result
	const VkBufferMemoryBarrier afterComputeBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *outputBuffer, 0ull, outputBufferSizeBytes);

	const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
	const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

	// Start recording commands

	beginCommandBuffer(vk, *cmdBuffer);

	vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline0);
	vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);

	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &writeUniformConstantsBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

	vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &betweenShadersBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

	// Switch to the second shader program
	vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline1);

	vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &afterComputeBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

	endCommandBuffer(vk, *cmdBuffer);

	// Wait for completion

	submitCommandsAndWait(vk, device, queue, *cmdBuffer);

	// Validate the results

	const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
	invalidateAlloc(vk, device, outputBufferAllocation);	// make device writes visible to the host

	const deUint32* bufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
	const deUint32	res = *bufferPtr;
	deUint32		ref = 0;

	// Expected sum: each workgroup contributed baseValue + its linear index.
	for (int ndx = 0; ndx < workGroupCount; ++ndx)
		ref += baseValue + ndx;

	if (res != ref)
	{
		std::ostringstream msg;
		msg << "ERROR: comparison failed, expected " << ref << ", got " << res;
		return tcu::TestStatus::fail(msg.str());
	}
	return tcu::TestStatus::pass("Compute succeeded");
}
1893 
// Test case: each workgroup atomically accumulates m_localSize input values
// into a single texel of an r32ui storage image via imageAtomicAdd.
class ImageAtomicOpTest : public vkt::TestCase
{
public:
						ImageAtomicOpTest		(tcu::TestContext&	testCtx,
												 const std::string& name,
												 const std::string& description,
												 const deUint32		localSize,
												 const tcu::IVec2&	imageSize);

	// Builds the GLSL compute shader ("comp") using GL_OES_shader_image_atomic.
	void				initPrograms			(SourceCollections& sourceCollections) const;
	TestInstance*		createInstance			(Context&			context) const;

private:
	const deUint32		m_localSize;	// invocations per workgroup (local_size_x)
	const tcu::IVec2	m_imageSize;	// image extent; one workgroup per texel
};
1910 
// Instance side of ImageAtomicOpTest: executes the dispatch and reads back
// the image for verification.
class ImageAtomicOpTestInstance : public vkt::TestInstance
{
public:
									ImageAtomicOpTestInstance		(Context&			context,
																	 const deUint32		localSize,
																	 const tcu::IVec2&	imageSize);

	tcu::TestStatus					iterate							(void);

private:
	const deUint32					m_localSize;	// invocations per workgroup (local_size_x)
	const tcu::IVec2				m_imageSize;	// image extent; one workgroup per texel
};
1924 
// Stores the test parameters; shader generation happens in initPrograms().
ImageAtomicOpTest::ImageAtomicOpTest (tcu::TestContext&		testCtx,
									  const std::string&	name,
									  const std::string&	description,
									  const deUint32		localSize,
									  const tcu::IVec2&		imageSize)
	: TestCase		(testCtx, name, description)
	, m_localSize	(localSize)
	, m_imageSize	(imageSize)
{
}
1935 
// Generates the compute shader: the first invocation of each workgroup
// clears the workgroup's texel, then (after memoryBarrierImage + barrier)
// every invocation atomically adds its input value to that texel.
void ImageAtomicOpTest::initPrograms (SourceCollections& sourceCollections) const
{
	std::ostringstream src;
	src << "#version 310 es\n"
		<< "#extension GL_OES_shader_image_atomic : require\n"
		<< "layout (local_size_x = " << m_localSize << ") in;\n"
		<< "layout(binding = 1, r32ui) coherent uniform highp uimage2D u_dstImg;\n"
		<< "layout(binding = 0) readonly buffer Input {\n"
		<< "    uint values[" << (multiplyComponents(m_imageSize) * m_localSize) << "];\n"
		<< "} sb_in;\n\n"
		<< "void main (void) {\n"
		<< "    uint stride = gl_NumWorkGroups.x*gl_WorkGroupSize.x;\n"
		<< "    uint value  = sb_in.values[gl_GlobalInvocationID.y*stride + gl_GlobalInvocationID.x];\n"
		<< "\n"
		<< "    if (gl_LocalInvocationIndex == 0u)\n"
		<< "        imageStore(u_dstImg, ivec2(gl_WorkGroupID.xy), uvec4(0));\n"
		<< "    memoryBarrierImage();\n"
		<< "    barrier();\n"
		<< "    imageAtomicAdd(u_dstImg, ivec2(gl_WorkGroupID.xy), value);\n"
		<< "}\n";

	sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
}
1959 
createInstance(Context & context) const1960 TestInstance* ImageAtomicOpTest::createInstance (Context& context) const
1961 {
1962 	return new ImageAtomicOpTestInstance(context, m_localSize, m_imageSize);
1963 }
1964 
// Stores the test parameters; all per-iteration work happens in iterate().
ImageAtomicOpTestInstance::ImageAtomicOpTestInstance (Context& context, const deUint32 localSize, const tcu::IVec2& imageSize)
	: TestInstance	(context)
	, m_localSize	(localSize)
	, m_imageSize	(imageSize)
{
}
1971 
iterate(void)1972 tcu::TestStatus ImageAtomicOpTestInstance::iterate (void)
1973 {
1974 	const DeviceInterface&	vk					= m_context.getDeviceInterface();
1975 	const VkDevice			device				= m_context.getDevice();
1976 	const VkQueue			queue				= m_context.getUniversalQueue();
1977 	const deUint32			queueFamilyIndex	= m_context.getUniversalQueueFamilyIndex();
1978 	Allocator&				allocator			= m_context.getDefaultAllocator();
1979 
1980 	// Create an image
1981 
1982 	const VkImageCreateInfo imageParams = make2DImageCreateInfo(m_imageSize, VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_STORAGE_BIT);
1983 	const ImageWithMemory image(vk, device, allocator, imageParams, MemoryRequirement::Any);
1984 
1985 	const VkImageSubresourceRange subresourceRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
1986 	const Unique<VkImageView> imageView(makeImageView(vk, device, *image, VK_IMAGE_VIEW_TYPE_2D, VK_FORMAT_R32_UINT, subresourceRange));
1987 
1988 	// Input buffer
1989 
1990 	const deUint32 numInputValues = multiplyComponents(m_imageSize) * m_localSize;
1991 	const VkDeviceSize inputBufferSizeBytes = sizeof(deUint32) * numInputValues;
1992 
1993 	const BufferWithMemory inputBuffer(vk, device, allocator, makeBufferCreateInfo(inputBufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
1994 
1995 	// Populate the input buffer with test data
1996 	{
1997 		de::Random rnd(0x77238ac2);
1998 		const Allocation& inputBufferAllocation = inputBuffer.getAllocation();
1999 		deUint32* bufferPtr = static_cast<deUint32*>(inputBufferAllocation.getHostPtr());
2000 		for (deUint32 i = 0; i < numInputValues; ++i)
2001 			*bufferPtr++ = rnd.getUint32();
2002 
2003 		flushAlloc(vk, device, inputBufferAllocation);
2004 	}
2005 
2006 	// Create a buffer to store shader output (copied from image data)
2007 
2008 	const deUint32 imageArea = multiplyComponents(m_imageSize);
2009 	const VkDeviceSize outputBufferSizeBytes = sizeof(deUint32) * imageArea;
2010 	const BufferWithMemory outputBuffer(vk, device, allocator, makeBufferCreateInfo(outputBufferSizeBytes, VK_BUFFER_USAGE_TRANSFER_DST_BIT), MemoryRequirement::HostVisible);
2011 
2012 	// Create descriptor set
2013 
2014 	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
2015 		DescriptorSetLayoutBuilder()
2016 		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
2017 		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_SHADER_STAGE_COMPUTE_BIT)
2018 		.build(vk, device));
2019 
2020 	const Unique<VkDescriptorPool> descriptorPool(
2021 		DescriptorPoolBuilder()
2022 		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
2023 		.addType(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE)
2024 		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
2025 
2026 	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
2027 
2028 	// Set the bindings
2029 
2030 	const VkDescriptorImageInfo imageDescriptorInfo = makeDescriptorImageInfo(DE_NULL, *imageView, VK_IMAGE_LAYOUT_GENERAL);
2031 	const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*inputBuffer, 0ull, inputBufferSizeBytes);
2032 
2033 	DescriptorSetUpdateBuilder()
2034 		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
2035 		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &imageDescriptorInfo)
2036 		.update(vk, device);
2037 
2038 	// Perform the computation
2039 	{
2040 		const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
2041 		const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
2042 		const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));
2043 
2044 		const VkBufferMemoryBarrier inputBufferPostHostWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *inputBuffer, 0ull, inputBufferSizeBytes);
2045 
2046 		const VkImageMemoryBarrier imageLayoutBarrier = makeImageMemoryBarrier(
2047 			(VkAccessFlags)0, VK_ACCESS_SHADER_WRITE_BIT,
2048 			VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL,
2049 			*image, subresourceRange);
2050 
2051 		// Prepare the command buffer
2052 
2053 		const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
2054 		const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
2055 
2056 		// Start recording commands
2057 
2058 		beginCommandBuffer(vk, *cmdBuffer);
2059 
2060 		vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
2061 		vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
2062 
2063 		vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &inputBufferPostHostWriteBarrier, 1, &imageLayoutBarrier);
2064 		vk.cmdDispatch(*cmdBuffer, m_imageSize.x(), m_imageSize.y(), 1u);
2065 
2066 		copyImageToBuffer(vk, *cmdBuffer, *image, *outputBuffer, m_imageSize, VK_ACCESS_SHADER_WRITE_BIT, VK_IMAGE_LAYOUT_GENERAL);
2067 
2068 		endCommandBuffer(vk, *cmdBuffer);
2069 
2070 		// Wait for completion
2071 
2072 		submitCommandsAndWait(vk, device, queue, *cmdBuffer);
2073 	}
2074 
2075 	// Validate the results
2076 
2077 	const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
2078 	invalidateAlloc(vk, device, outputBufferAllocation);
2079 
2080 	const deUint32* bufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
2081 	const deUint32* refBufferPtr = static_cast<deUint32*>(inputBuffer.getAllocation().getHostPtr());
2082 
2083 	for (deUint32 pixelNdx = 0; pixelNdx < imageArea; ++pixelNdx)
2084 	{
2085 		const deUint32	res = bufferPtr[pixelNdx];
2086 		deUint32		ref = 0;
2087 
2088 		for (deUint32 offs = 0; offs < m_localSize; ++offs)
2089 			ref += refBufferPtr[pixelNdx * m_localSize + offs];
2090 
2091 		if (res != ref)
2092 		{
2093 			std::ostringstream msg;
2094 			msg << "Comparison failed for pixel " << pixelNdx;
2095 			return tcu::TestStatus::fail(msg.str());
2096 		}
2097 	}
2098 	return tcu::TestStatus::pass("Compute succeeded");
2099 }
2100 
// Test case: verifies that an image memory barrier between two compute
// dispatches correctly synchronizes a write-then-read pattern on a storage
// image (first shader writes per-texel values, second shader sums them).
class ImageBarrierTest : public vkt::TestCase
{
public:
						ImageBarrierTest	(tcu::TestContext&	testCtx,
											const std::string&	name,
											const std::string&	description,
											const tcu::IVec2&	imageSize);

	// Registers two GLSL compute shaders: "comp0" (image writer) and "comp1" (image reader/accumulator).
	void				initPrograms		(SourceCollections& sourceCollections) const;
	TestInstance*		createInstance		(Context&			context) const;

private:
	const tcu::IVec2	m_imageSize;	// Width/height of the storage image; one workgroup is dispatched per texel.
};
2115 
// Runtime instance for ImageBarrierTest; all work happens in iterate().
class ImageBarrierTestInstance : public vkt::TestInstance
{
public:
									ImageBarrierTestInstance	(Context&			context,
																 const tcu::IVec2&	imageSize);

	tcu::TestStatus					iterate						(void);

private:
	const tcu::IVec2				m_imageSize;	// Storage image dimensions, copied from the test case.
};
2127 
// Pass-through constructor; simply captures the test parameters.
ImageBarrierTest::ImageBarrierTest (tcu::TestContext&	testCtx,
									const std::string&	name,
									const std::string&	description,
									const tcu::IVec2&	imageSize)
	: TestCase		(testCtx, name, description)
	, m_imageSize	(imageSize)
{
}
2136 
initPrograms(SourceCollections & sourceCollections) const2137 void ImageBarrierTest::initPrograms (SourceCollections& sourceCollections) const
2138 {
2139 	sourceCollections.glslSources.add("comp0") << glu::ComputeSource(
2140 		"#version 310 es\n"
2141 		"layout (local_size_x = 1) in;\n"
2142 		"layout(binding = 2) readonly uniform Constants {\n"
2143 		"    uint u_baseVal;\n"
2144 		"};\n"
2145 		"layout(binding = 1, r32ui) writeonly uniform highp uimage2D u_img;\n"
2146 		"void main (void) {\n"
2147 		"    uint offset = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
2148 		"    imageStore(u_img, ivec2(gl_WorkGroupID.xy), uvec4(offset + u_baseVal, 0, 0, 0));\n"
2149 		"}\n");
2150 
2151 	sourceCollections.glslSources.add("comp1") << glu::ComputeSource(
2152 		"#version 310 es\n"
2153 		"layout (local_size_x = 1) in;\n"
2154 		"layout(binding = 1, r32ui) readonly uniform highp uimage2D u_img;\n"
2155 		"layout(binding = 0) coherent buffer Output {\n"
2156 		"    uint sum;\n"
2157 		"};\n"
2158 		"void main (void) {\n"
2159 		"    uint value = imageLoad(u_img, ivec2(gl_WorkGroupID.xy)).x;\n"
2160 		"    atomicAdd(sum, value);\n"
2161 		"}\n");
2162 }
2163 
// Framework factory hook; ownership of the returned instance passes to the caller.
TestInstance* ImageBarrierTest::createInstance (Context& context) const
{
	return new ImageBarrierTestInstance(context, m_imageSize);
}
2168 
// Captures the image size for use in iterate().
ImageBarrierTestInstance::ImageBarrierTestInstance (Context& context, const tcu::IVec2& imageSize)
	: TestInstance	(context)
	, m_imageSize	(imageSize)
{
}
2174 
iterate(void)2175 tcu::TestStatus ImageBarrierTestInstance::iterate (void)
2176 {
2177 	const DeviceInterface&	vk					= m_context.getDeviceInterface();
2178 	const VkDevice			device				= m_context.getDevice();
2179 	const VkQueue			queue				= m_context.getUniversalQueue();
2180 	const deUint32			queueFamilyIndex	= m_context.getUniversalQueueFamilyIndex();
2181 	Allocator&				allocator			= m_context.getDefaultAllocator();
2182 
2183 	// Create an image used by both shaders
2184 
2185 	const VkImageCreateInfo imageParams = make2DImageCreateInfo(m_imageSize, VK_IMAGE_USAGE_STORAGE_BIT);
2186 	const ImageWithMemory image(vk, device, allocator, imageParams, MemoryRequirement::Any);
2187 
2188 	const VkImageSubresourceRange subresourceRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
2189 	const Unique<VkImageView> imageView(makeImageView(vk, device, *image, VK_IMAGE_VIEW_TYPE_2D, VK_FORMAT_R32_UINT, subresourceRange));
2190 
2191 	// Create an output buffer
2192 
2193 	const VkDeviceSize outputBufferSizeBytes = sizeof(deUint32);
2194 	const BufferWithMemory outputBuffer(vk, device, allocator, makeBufferCreateInfo(outputBufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
2195 
2196 	// Initialize atomic counter value to zero
2197 	{
2198 		const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
2199 		deUint32* outputBufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
2200 		*outputBufferPtr = 0;
2201 		flushAlloc(vk, device, outputBufferAllocation);
2202 	}
2203 
2204 	// Create a uniform buffer (to pass uniform constants)
2205 
2206 	const VkDeviceSize uniformBufferSizeBytes = sizeof(deUint32);
2207 	const BufferWithMemory uniformBuffer(vk, device, allocator, makeBufferCreateInfo(uniformBufferSizeBytes, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT), MemoryRequirement::HostVisible);
2208 
2209 	// Set the constants in the uniform buffer
2210 
2211 	const deUint32	baseValue = 127;
2212 	{
2213 		const Allocation& uniformBufferAllocation = uniformBuffer.getAllocation();
2214 		deUint32* uniformBufferPtr = static_cast<deUint32*>(uniformBufferAllocation.getHostPtr());
2215 		uniformBufferPtr[0] = baseValue;
2216 
2217 		flushAlloc(vk, device, uniformBufferAllocation);
2218 	}
2219 
2220 	// Create descriptor set
2221 
2222 	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
2223 		DescriptorSetLayoutBuilder()
2224 		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
2225 		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_SHADER_STAGE_COMPUTE_BIT)
2226 		.addSingleBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
2227 		.build(vk, device));
2228 
2229 	const Unique<VkDescriptorPool> descriptorPool(
2230 		DescriptorPoolBuilder()
2231 		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
2232 		.addType(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE)
2233 		.addType(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)
2234 		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
2235 
2236 	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
2237 
2238 	const VkDescriptorImageInfo imageDescriptorInfo = makeDescriptorImageInfo(DE_NULL, *imageView, VK_IMAGE_LAYOUT_GENERAL);
2239 	const VkDescriptorBufferInfo outputBufferDescriptorInfo = makeDescriptorBufferInfo(*outputBuffer, 0ull, outputBufferSizeBytes);
2240 	const VkDescriptorBufferInfo uniformBufferDescriptorInfo = makeDescriptorBufferInfo(*uniformBuffer, 0ull, uniformBufferSizeBytes);
2241 	DescriptorSetUpdateBuilder()
2242 		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &outputBufferDescriptorInfo)
2243 		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &imageDescriptorInfo)
2244 		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(2u), VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, &uniformBufferDescriptorInfo)
2245 		.update(vk, device);
2246 
2247 	// Perform the computation
2248 
2249 	const Unique<VkShaderModule>	shaderModule0(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp0"), 0));
2250 	const Unique<VkShaderModule>	shaderModule1(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp1"), 0));
2251 
2252 	const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
2253 	const Unique<VkPipeline> pipeline0(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule0));
2254 	const Unique<VkPipeline> pipeline1(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule1));
2255 
2256 	const VkBufferMemoryBarrier writeUniformConstantsBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_UNIFORM_READ_BIT, *uniformBuffer, 0ull, uniformBufferSizeBytes);
2257 
2258 	const VkImageMemoryBarrier imageLayoutBarrier = makeImageMemoryBarrier(
2259 		0u, 0u,
2260 		VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL,
2261 		*image, subresourceRange);
2262 
2263 	const VkImageMemoryBarrier imageBarrierBetweenShaders = makeImageMemoryBarrier(
2264 		VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT,
2265 		VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_GENERAL,
2266 		*image, subresourceRange);
2267 
2268 	const VkBufferMemoryBarrier afterComputeBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *outputBuffer, 0ull, outputBufferSizeBytes);
2269 
2270 	const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
2271 	const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
2272 
2273 	// Start recording commands
2274 
2275 	beginCommandBuffer(vk, *cmdBuffer);
2276 
2277 	vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline0);
2278 	vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
2279 
2280 	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &writeUniformConstantsBarrier, 1, &imageLayoutBarrier);
2281 
2282 	vk.cmdDispatch(*cmdBuffer, m_imageSize.x(), m_imageSize.y(), 1u);
2283 	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 0, (const VkBufferMemoryBarrier*)DE_NULL, 1, &imageBarrierBetweenShaders);
2284 
2285 	// Switch to the second shader program
2286 	vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline1);
2287 
2288 	vk.cmdDispatch(*cmdBuffer, m_imageSize.x(), m_imageSize.y(), 1u);
2289 	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &afterComputeBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
2290 
2291 	endCommandBuffer(vk, *cmdBuffer);
2292 
2293 	// Wait for completion
2294 
2295 	submitCommandsAndWait(vk, device, queue, *cmdBuffer);
2296 
2297 	// Validate the results
2298 
2299 	const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
2300 	invalidateAlloc(vk, device, outputBufferAllocation);
2301 
2302 	const int		numValues = multiplyComponents(m_imageSize);
2303 	const deUint32* bufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
2304 	const deUint32	res = *bufferPtr;
2305 	deUint32		ref = 0;
2306 
2307 	for (int ndx = 0; ndx < numValues; ++ndx)
2308 		ref += baseValue + ndx;
2309 
2310 	if (res != ref)
2311 	{
2312 		std::ostringstream msg;
2313 		msg << "ERROR: comparison failed, expected " << ref << ", got " << res;
2314 		return tcu::TestStatus::fail(msg.str());
2315 	}
2316 	return tcu::TestStatus::pass("Compute succeeded");
2317 }
2318 
// Base class for compute test instances that run on an explicitly created
// logical device spanning a device group (VK_KHR_device_group). Sub-classes
// must use the accessors below instead of the context's default device.
class ComputeTestInstance : public vkt::TestInstance
{
public:
		ComputeTestInstance		(Context& context)
		: TestInstance			(context)
		, m_numPhysDevices		(1)
		, m_queueFamilyIndex	(0)
	{
		// Eagerly builds the instance/device; may throw during construction.
		createDeviceGroup();
	}

		~ComputeTestInstance	()
	{
	}

	// Creates the custom instance, selects the device group from the command
	// line, and creates the logical device; populates all members below.
	void							createDeviceGroup	(void);
	const vk::DeviceInterface&		getDeviceInterface	(void)			{ return *m_deviceDriver; }
	vk::VkInstance					getInstance			(void)			{ return m_deviceGroupInstance; }
	vk::VkDevice					getDevice			(void)			{ return *m_logicalDevice; }
	vk::VkPhysicalDevice			getPhysicalDevice	(deUint32 i = 0){ return m_physicalDevices[i]; }

protected:
	deUint32							m_numPhysDevices;	// Number of physical devices in the selected group.
	deUint32							m_queueFamilyIndex;	// Queue family with compute support on the selected device.

private:
	CustomInstance						m_deviceGroupInstance;
	vk::Move<vk::VkDevice>				m_logicalDevice;
	std::vector<vk::VkPhysicalDevice>	m_physicalDevices;
#ifndef CTS_USES_VULKANSC
	de::MovePtr<vk::DeviceDriver>		m_deviceDriver;
#else
	de::MovePtr<vk::DeviceDriverSC, vk::DeinitDeviceDeleter>	m_deviceDriver;
#endif // CTS_USES_VULKANSC
};
2354 
// Creates a custom Vulkan instance with VK_KHR_device_group_creation, picks
// the device group and physical device selected on the command line, and
// builds a logical device over the whole group. Fills in m_numPhysDevices,
// m_queueFamilyIndex, m_physicalDevices, m_logicalDevice and m_deviceDriver.
// Note: the pNext chain is assembled in order, so statement order matters.
void ComputeTestInstance::createDeviceGroup (void)
{
	const tcu::CommandLine&							cmdLine					= m_context.getTestContext().getCommandLine();
	// Command-line indices are 1-based; convert to 0-based.
	const deUint32									devGroupIdx				= cmdLine.getVKDeviceGroupId() - 1;
	const deUint32									physDeviceIdx			= cmdLine.getVKDeviceId() - 1;
	const float										queuePriority			= 1.0f;
	const std::vector<std::string>					requiredExtensions		(1, "VK_KHR_device_group_creation");
	m_deviceGroupInstance													= createCustomInstanceWithExtensions(m_context, requiredExtensions);
	std::vector<VkPhysicalDeviceGroupProperties>	devGroupProperties		= enumeratePhysicalDeviceGroups(m_context.getInstanceInterface(), m_deviceGroupInstance);
	m_numPhysDevices														= devGroupProperties[devGroupIdx].physicalDeviceCount;
	std::vector<const char*>						deviceExtensions;

	// VK_KHR_device_group was promoted to core in Vulkan 1.1; only enable it explicitly when needed.
	if (!isCoreDeviceExtension(m_context.getUsedApiVersion(), "VK_KHR_device_group"))
		deviceExtensions.push_back("VK_KHR_device_group");

	VkDeviceGroupDeviceCreateInfo					deviceGroupInfo			=
	{
		VK_STRUCTURE_TYPE_DEVICE_GROUP_DEVICE_CREATE_INFO,									//stype
		DE_NULL,																			//pNext
		devGroupProperties[devGroupIdx].physicalDeviceCount,								//physicalDeviceCount
		devGroupProperties[devGroupIdx].physicalDevices										//physicalDevices
	};
	const InstanceDriver&							instance				(m_deviceGroupInstance.getDriver());
	const VkPhysicalDeviceFeatures					deviceFeatures			= getPhysicalDeviceFeatures(instance, deviceGroupInfo.pPhysicalDevices[physDeviceIdx]);
	const std::vector<VkQueueFamilyProperties>		queueProps				= getPhysicalDeviceQueueFamilyProperties(instance, devGroupProperties[devGroupIdx].physicalDevices[physDeviceIdx]);

	m_physicalDevices.resize(m_numPhysDevices);
	for (deUint32 physDevIdx = 0; physDevIdx < m_numPhysDevices; physDevIdx++)
		m_physicalDevices[physDevIdx] = devGroupProperties[devGroupIdx].physicalDevices[physDevIdx];

	// Pick a compute-capable queue family (the last one found wins).
	for (size_t queueNdx = 0; queueNdx < queueProps.size(); queueNdx++)
	{
		if (queueProps[queueNdx].queueFlags & VK_QUEUE_COMPUTE_BIT)
			m_queueFamilyIndex = (deUint32)queueNdx;
	}

	VkDeviceQueueCreateInfo							queueInfo				=
	{
		VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,		// VkStructureType					sType;
		DE_NULL,										// const void*						pNext;
		(VkDeviceQueueCreateFlags)0u,					// VkDeviceQueueCreateFlags			flags;
		m_queueFamilyIndex,								// deUint32							queueFamilyIndex;
		1u,												// deUint32							queueCount;
		&queuePriority									// const float*						pQueuePriorities;
	};

	// Build the pNext chain head-first; Vulkan SC prepends its reservation structures below.
	void* pNext												= &deviceGroupInfo;
#ifdef CTS_USES_VULKANSC
	// Vulkan SC requires up-front object/pipeline reservations on device creation.
	VkDeviceObjectReservationCreateInfo memReservationInfo	= cmdLine.isSubProcess() ? m_context.getResourceInterface()->getStatMax() : resetDeviceObjectReservationCreateInfo();
	memReservationInfo.pNext								= pNext;
	pNext													= &memReservationInfo;

	VkPhysicalDeviceVulkanSC10Features sc10Features			= createDefaultSC10Features();
	sc10Features.pNext										= pNext;
	pNext													= &sc10Features;
	VkPipelineCacheCreateInfo			pcCI;
	std::vector<VkPipelinePoolSize>		poolSizes;
	if (cmdLine.isSubProcess())
	{
		if (m_context.getResourceInterface()->getCacheDataSize() > 0)
		{
			// Reuse the pipeline cache recorded by the main process.
			pcCI =
			{
				VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO,			// VkStructureType				sType;
				DE_NULL,												// const void*					pNext;
				VK_PIPELINE_CACHE_CREATE_READ_ONLY_BIT |
					VK_PIPELINE_CACHE_CREATE_USE_APPLICATION_STORAGE_BIT,	// VkPipelineCacheCreateFlags	flags;
				m_context.getResourceInterface()->getCacheDataSize(),	// deUintptr					initialDataSize;
				m_context.getResourceInterface()->getCacheData()		// const void*					pInitialData;
			};
			memReservationInfo.pipelineCacheCreateInfoCount		= 1;
			memReservationInfo.pPipelineCacheCreateInfos		= &pcCI;
		}

		poolSizes							= m_context.getResourceInterface()->getPipelinePoolSizes();
		if (!poolSizes.empty())
		{
			memReservationInfo.pipelinePoolSizeCount		= deUint32(poolSizes.size());
			memReservationInfo.pPipelinePoolSizes			= poolSizes.data();
		}
	}

#endif // CTS_USES_VULKANSC

	const VkDeviceCreateInfo						deviceInfo				=
	{
		VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,							// VkStructureType					sType;
		pNext,															// const void*						pNext;
		(VkDeviceCreateFlags)0,											// VkDeviceCreateFlags				flags;
		1u	,															// uint32_t							queueCreateInfoCount;
		&queueInfo,														// const VkDeviceQueueCreateInfo*	pQueueCreateInfos;
		0u,																// uint32_t							enabledLayerCount;
		DE_NULL,														// const char* const*				ppEnabledLayerNames;
		deUint32(deviceExtensions.size()),								// uint32_t							enabledExtensionCount;
		(deviceExtensions.empty() ? DE_NULL : &deviceExtensions[0]),	// const char* const*				ppEnabledExtensionNames;
		&deviceFeatures,												// const VkPhysicalDeviceFeatures*	pEnabledFeatures;
	};

	m_logicalDevice		= createCustomDevice(m_context.getTestContext().getCommandLine().isValidationEnabled(), m_context.getPlatformInterface(), m_deviceGroupInstance, instance, deviceGroupInfo.pPhysicalDevices[physDeviceIdx], &deviceInfo);
#ifndef CTS_USES_VULKANSC
	m_deviceDriver = de::MovePtr<DeviceDriver>(new DeviceDriver(m_context.getPlatformInterface(), m_deviceGroupInstance, *m_logicalDevice));
#else
	m_deviceDriver = de::MovePtr<DeviceDriverSC, DeinitDeviceDeleter>(new DeviceDriverSC(m_context.getPlatformInterface(), m_context.getInstance(), *m_logicalDevice, m_context.getTestContext().getCommandLine(), m_context.getResourceInterface(), m_context.getDeviceVulkanSC10Properties(), m_context.getDeviceProperties()), vk::DeinitDeviceDeleter(m_context.getResourceInterface().get(), *m_logicalDevice));
#endif // CTS_USES_VULKANSC
}
2460 
// Test case: exercises vkCmdDispatchBase by splitting one logical compute
// grid (worksize) into per-device chunks (splitsize) dispatched with
// non-zero base workgroup offsets.
class DispatchBaseTest : public vkt::TestCase
{
public:
						DispatchBaseTest	(tcu::TestContext&	testCtx,
											const std::string&	name,
											const std::string&	description,
											const deUint32		numValues,
											const tcu::IVec3&	localsize,
											const tcu::IVec3&	worksize,
											const tcu::IVec3&	splitsize);

	void				initPrograms		(SourceCollections& sourceCollections) const;
	TestInstance*		createInstance		(Context&			context) const;

private:
	const deUint32					m_numValues;	// Number of deUint32 elements in the in/out SSBO.
	const tcu::IVec3				m_localSize;	// Workgroup (local) size.
	const tcu::IVec3				m_workSize;		// Full grid size, in workgroups.
	const tcu::IVec3				m_splitSize;	// Per-physical-device portion of the grid.
};
2481 
// Runtime instance for DispatchBaseTest; runs on the device-group device
// provided by ComputeTestInstance.
class DispatchBaseTestInstance : public ComputeTestInstance
{
public:
									DispatchBaseTestInstance	(Context&			context,
																const deUint32		numValues,
																const tcu::IVec3&	localsize,
																const tcu::IVec3&	worksize,
																const tcu::IVec3&	splitsize);

	// Returns true iff every component of 'big' is >= the matching component
	// of 'small' and is evenly divisible by it.
	bool							isInputVectorValid			(const tcu::IVec3& small, const tcu::IVec3& big);
	tcu::TestStatus					iterate						(void);

private:
	const deUint32					m_numValues;		// SSBO element count.
	const tcu::IVec3				m_localSize;		// Workgroup size.
	const tcu::IVec3				m_workSize;			// Full grid size, in workgroups.
	const tcu::IVec3				m_splitWorkSize;	// Per-device slice of the grid.
};
2500 
// Pass-through constructor; simply captures the test parameters.
DispatchBaseTest::DispatchBaseTest (tcu::TestContext&	testCtx,
									const std::string&	name,
									const std::string&	description,
									const deUint32		numValues,
									const tcu::IVec3&	localsize,
									const tcu::IVec3&	worksize,
									const tcu::IVec3&	splitsize)
	: TestCase		(testCtx, name, description)
	, m_numValues	(numValues)
	, m_localSize	(localsize)
	, m_workSize	(worksize)
	, m_splitSize	(splitsize)
{
}
2515 
initPrograms(SourceCollections & sourceCollections) const2516 void DispatchBaseTest::initPrograms (SourceCollections& sourceCollections) const
2517 {
2518 	std::ostringstream src;
2519 	src << "#version 310 es\n"
2520 		<< "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
2521 
2522 		<< "layout(binding = 0) buffer InOut {\n"
2523 		<< "    uint values[" << de::toString(m_numValues) << "];\n"
2524 		<< "} sb_inout;\n"
2525 
2526 		<< "layout(binding = 1) readonly uniform uniformInput {\n"
2527 		<< "    uvec3 gridSize;\n"
2528 		<< "} ubo_in;\n"
2529 
2530 		<< "void main (void) {\n"
2531 		<< "    uvec3 size = ubo_in.gridSize * gl_WorkGroupSize;\n"
2532 		<< "    uint numValuesPerInv = uint(sb_inout.values.length()) / (size.x*size.y*size.z);\n"
2533 		<< "    uint index = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
2534 		<< "    uint offset = numValuesPerInv*index;\n"
2535 		<< "    for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
2536 		<< "        sb_inout.values[offset + ndx] = ~sb_inout.values[offset + ndx];\n"
2537 		<< "}\n";
2538 
2539 	sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
2540 }
2541 
// Framework factory hook; ownership of the returned instance passes to the caller.
TestInstance* DispatchBaseTest::createInstance (Context& context) const
{
	return new DispatchBaseTestInstance(context, m_numValues, m_localSize, m_workSize, m_splitSize);
}
2546 
DispatchBaseTestInstance(Context & context,const deUint32 numValues,const tcu::IVec3 & localsize,const tcu::IVec3 & worksize,const tcu::IVec3 & splitsize)2547 DispatchBaseTestInstance::DispatchBaseTestInstance (Context& context,
2548 													const deUint32		numValues,
2549 													const tcu::IVec3&	localsize,
2550 													const tcu::IVec3&	worksize,
2551 													const tcu::IVec3&	splitsize)
2552 
2553 	: ComputeTestInstance	(context)
2554 	, m_numValues			(numValues)
2555 	, m_localSize			(localsize)
2556 	, m_workSize			(worksize)
2557 	, m_splitWorkSize		(splitsize)
2558 {
2559 	// For easy work distribution across physical devices:
2560 	// WorkSize should be a multiple of SplitWorkSize only in the X component
2561 	if ((!isInputVectorValid(m_splitWorkSize, m_workSize)) ||
2562 		(m_workSize.x() <= m_splitWorkSize.x()) ||
2563 		(m_workSize.y() != m_splitWorkSize.y()) ||
2564 		(m_workSize.z() != m_splitWorkSize.z()))
2565 		TCU_THROW(TestError, "Invalid Input.");
2566 
2567 	// For easy work distribution within the same physical device:
2568 	// SplitWorkSize should be a multiple of localSize in Y or Z component
2569 	if ((!isInputVectorValid(m_localSize, m_splitWorkSize)) ||
2570 		(m_localSize.x() != m_splitWorkSize.x()) ||
2571 		(m_localSize.y() >= m_splitWorkSize.y()) ||
2572 		(m_localSize.z() >= m_splitWorkSize.z()))
2573 		TCU_THROW(TestError, "Invalid Input.");
2574 
2575 	if ((multiplyComponents(m_workSize) / multiplyComponents(m_splitWorkSize)) < (deInt32) m_numPhysDevices)
2576 		TCU_THROW(TestError, "Not enough work to distribute across all physical devices.");
2577 
2578 	deUint32 totalWork = multiplyComponents(m_workSize) * multiplyComponents(m_localSize);
2579 	if ((totalWork > numValues) || (numValues % totalWork != 0))
2580 		TCU_THROW(TestError, "Buffer too small/not aligned to cover all values.");
2581 }
2582 
isInputVectorValid(const tcu::IVec3 & small,const tcu::IVec3 & big)2583 bool DispatchBaseTestInstance::isInputVectorValid(const tcu::IVec3& small, const tcu::IVec3& big)
2584 {
2585 	if (((big.x() < small.x()) || (big.y() < small.y()) || (big.z() < small.z())) ||
2586 		((big.x() % small.x() != 0) || (big.y() % small.y() != 0) || (big.z() % small.z() != 0)))
2587 		return false;
2588 	return true;
2589 }
2590 
iterate(void)2591 tcu::TestStatus DispatchBaseTestInstance::iterate (void)
2592 {
2593 	const DeviceInterface&	vk					= getDeviceInterface();
2594 	const VkDevice			device				= getDevice();
2595 	const VkQueue			queue				= getDeviceQueue(vk, device, m_queueFamilyIndex, 0);
2596 	SimpleAllocator			allocator			(vk, device, getPhysicalDeviceMemoryProperties(m_context.getInstanceInterface(), getPhysicalDevice()));
2597 	deUint32				totalWorkloadSize	= 0;
2598 
2599 	// Create an uniform and input/output buffer
2600 	const deUint32 uniformBufSize = 3; // Pass the compute grid size
2601 	const VkDeviceSize uniformBufferSizeBytes = sizeof(deUint32) * uniformBufSize;
2602 	const BufferWithMemory uniformBuffer(vk, device, allocator, makeBufferCreateInfo(uniformBufferSizeBytes, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT), MemoryRequirement::HostVisible);
2603 
2604 	const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * m_numValues;
2605 	const BufferWithMemory buffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
2606 
2607 	// Fill the buffers with data
2608 	typedef std::vector<deUint32> data_vector_t;
2609 	data_vector_t uniformInputData(uniformBufSize);
2610 	data_vector_t inputData(m_numValues);
2611 
2612 	{
2613 		const Allocation& bufferAllocation = uniformBuffer.getAllocation();
2614 		deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
2615 		uniformInputData[0] = *bufferPtr++ = m_workSize.x();
2616 		uniformInputData[1] = *bufferPtr++ = m_workSize.y();
2617 		uniformInputData[2] = *bufferPtr++ = m_workSize.z();
2618 		flushAlloc(vk, device, bufferAllocation);
2619 	}
2620 
2621 	{
2622 		de::Random rnd(0x82ce7f);
2623 		const Allocation& bufferAllocation = buffer.getAllocation();
2624 		deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
2625 		for (deUint32 i = 0; i < m_numValues; ++i)
2626 			inputData[i] = *bufferPtr++ = rnd.getUint32();
2627 
2628 		flushAlloc(vk, device, bufferAllocation);
2629 	}
2630 
2631 	// Create descriptor set
2632 	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
2633 		DescriptorSetLayoutBuilder()
2634 		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
2635 		.addSingleBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
2636 		.build(vk, device));
2637 
2638 	const Unique<VkDescriptorPool> descriptorPool(
2639 		DescriptorPoolBuilder()
2640 		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
2641 		.addType(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)
2642 		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
2643 
2644 	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
2645 
2646 	const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*buffer, 0ull, bufferSizeBytes);
2647 	const VkDescriptorBufferInfo uniformBufferDescriptorInfo = makeDescriptorBufferInfo(*uniformBuffer, 0ull, uniformBufferSizeBytes);
2648 
2649 	DescriptorSetUpdateBuilder()
2650 		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
2651 		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, &uniformBufferDescriptorInfo)
2652 		.update(vk, device);
2653 
2654 	const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
2655 	const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
2656 	const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, static_cast<VkPipelineCreateFlags>(VK_PIPELINE_CREATE_DISPATCH_BASE), *shaderModule, static_cast<VkPipelineShaderStageCreateFlags>(0u)));
2657 
2658 	const VkBufferMemoryBarrier hostWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *buffer, 0ull, bufferSizeBytes);
2659 	const VkBufferMemoryBarrier hostUniformWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_UNIFORM_READ_BIT, *uniformBuffer, 0ull, uniformBufferSizeBytes);
2660 
2661 	const VkBufferMemoryBarrier shaderWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer, 0ull, bufferSizeBytes);
2662 
2663 	const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, m_queueFamilyIndex));
2664 	const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
2665 
2666 	// Start recording commands
2667 	beginCommandBuffer(vk, *cmdBuffer);
2668 
2669 	vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
2670 	vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
2671 
2672 	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostUniformWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
2673 
2674 	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
2675 
2676 	// Split the workload across all physical devices based on m_splitWorkSize.x()
2677 	for (deUint32 physDevIdx = 0; physDevIdx < m_numPhysDevices; physDevIdx++)
2678 	{
2679 		deUint32 baseGroupX = physDevIdx * m_splitWorkSize.x();
2680 		deUint32 baseGroupY = 0;
2681 		deUint32 baseGroupZ = 0;
2682 
2683 		// Split the workload within the physical device based on m_localSize.y() and m_localSize.z()
2684 		for (deInt32 localIdxY = 0; localIdxY < (m_splitWorkSize.y() / m_localSize.y()); localIdxY++)
2685 		{
2686 			for (deInt32 localIdxZ = 0; localIdxZ < (m_splitWorkSize.z() / m_localSize.z()); localIdxZ++)
2687 			{
2688 				deUint32 offsetX = baseGroupX;
2689 				deUint32 offsetY = baseGroupY + localIdxY * m_localSize.y();
2690 				deUint32 offsetZ = baseGroupZ + localIdxZ * m_localSize.z();
2691 
2692 				deUint32 localSizeX = (physDevIdx == (m_numPhysDevices - 1)) ? m_workSize.x() - baseGroupX : m_localSize.x();
2693 				deUint32 localSizeY = m_localSize.y();
2694 				deUint32 localSizeZ = m_localSize.z();
2695 
2696 				totalWorkloadSize += (localSizeX * localSizeY * localSizeZ);
2697 				vk.cmdDispatchBase(*cmdBuffer, offsetX, offsetY, offsetZ, localSizeX, localSizeY, localSizeZ);
2698 			}
2699 		}
2700 	}
2701 
2702 	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &shaderWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
2703 
2704 	endCommandBuffer(vk, *cmdBuffer);
2705 	submitCommandsAndWait(vk, device, queue, *cmdBuffer);
2706 
2707 	if (totalWorkloadSize != deUint32(multiplyComponents(m_workSize)))
2708 		TCU_THROW(TestError, "Not covering the entire workload.");
2709 
2710 	// Validate the results
2711 	const Allocation& bufferAllocation = buffer.getAllocation();
2712 	invalidateAlloc(vk, device, bufferAllocation);
2713 	const deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
2714 
2715 	for (deUint32 ndx = 0; ndx < m_numValues; ++ndx)
2716 	{
2717 		const deUint32 res = bufferPtr[ndx];
2718 		const deUint32 ref = ~inputData[ndx];
2719 
2720 		if (res != ref)
2721 		{
2722 			std::ostringstream msg;
2723 			msg << "Comparison failed for InOut.values[" << ndx << "]";
2724 			return tcu::TestStatus::fail(msg.str());
2725 		}
2726 	}
2727 	return tcu::TestStatus::pass("Compute succeeded");
2728 }
2729 
2730 class DeviceIndexTest : public vkt::TestCase
2731 {
2732 public:
2733 	DeviceIndexTest		(tcu::TestContext&	testCtx,
2734 											const std::string&	name,
2735 											const std::string&	description,
2736 											const deUint32		numValues,
2737 											const tcu::IVec3&	localsize,
2738 											const tcu::IVec3&	splitsize);
2739 
2740 	void				initPrograms		(SourceCollections& sourceCollections) const;
2741 	TestInstance*		createInstance		(Context&			context) const;
2742 
2743 private:
2744 	const deUint32					m_numValues;
2745 	const tcu::IVec3				m_localSize;
2746 	const tcu::IVec3				m_workSize;
2747 	const tcu::IVec3				m_splitSize;
2748 };
2749 
// Runtime side of DeviceIndexTest: iterate() dispatches the "comp" shader with
// varying device masks on a device group and validates the per-device results.
class DeviceIndexTestInstance : public ComputeTestInstance
{
public:
									DeviceIndexTestInstance	(Context&			context,
																const deUint32		numValues,
																const tcu::IVec3&	localsize,
																const tcu::IVec3&	worksize);
	tcu::TestStatus					iterate						(void);
private:
	const deUint32					m_numValues;	// element count of the output storage buffer
	const tcu::IVec3				m_localSize;	// compute shader local workgroup size
	tcu::IVec3						m_workSize;		// dispatch size, in workgroups
};
2763 
// Stores the test parameters; all real work happens in initPrograms() and in
// the instance created by createInstance().
DeviceIndexTest::DeviceIndexTest (tcu::TestContext&	testCtx,
									const std::string&	name,
									const std::string&	description,
									const deUint32		numValues,
									const tcu::IVec3&	localsize,
									const tcu::IVec3&	worksize)
	: TestCase		(testCtx, name, description)
	, m_numValues	(numValues)
	, m_localSize	(localsize)
	, m_workSize	(worksize)
{
}
2776 
initPrograms(SourceCollections & sourceCollections) const2777 void DeviceIndexTest::initPrograms (SourceCollections& sourceCollections) const
2778 {
2779 	std::ostringstream src;
2780 	src << "#version 310 es\n"
2781 		<< "#extension GL_EXT_device_group : require\n"
2782 		<< "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
2783 
2784 		<< "layout(binding = 0) buffer InOut {\n"
2785 		<< "    uint values[" << de::toString(m_numValues) << "];\n"
2786 		<< "} sb_inout;\n"
2787 
2788 		<< "layout(binding = 1) readonly uniform uniformInput {\n"
2789 		<< "    uint baseOffset[1+" << VK_MAX_DEVICE_GROUP_SIZE << "];\n"
2790 		<< "} ubo_in;\n"
2791 
2792 		<< "void main (void) {\n"
2793 		<< "    uvec3 size = gl_NumWorkGroups * gl_WorkGroupSize;\n"
2794 		<< "    uint numValuesPerInv = uint(sb_inout.values.length()) / (size.x*size.y*size.z);\n"
2795 		<< "    uint index = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
2796 		<< "    uint offset = numValuesPerInv*index;\n"
2797 		<< "    for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
2798 		<< "        sb_inout.values[offset + ndx] = ubo_in.baseOffset[0] + ubo_in.baseOffset[gl_DeviceIndex + 1];\n"
2799 		<< "}\n";
2800 
2801 	sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
2802 }
2803 
createInstance(Context & context) const2804 TestInstance* DeviceIndexTest::createInstance (Context& context) const
2805 {
2806 	return new DeviceIndexTestInstance(context, m_numValues, m_localSize, m_workSize);
2807 }
2808 
// Captures the test parameters for use in iterate(); the ComputeTestInstance
// base class provides the device-group device, queue family and device count.
DeviceIndexTestInstance::DeviceIndexTestInstance (Context& context,
													const deUint32		numValues,
													const tcu::IVec3&	localsize,
													const tcu::IVec3&	worksize)

	: ComputeTestInstance	(context)
	, m_numValues			(numValues)
	, m_localSize			(localsize)
	, m_workSize			(worksize)
{}
2819 
// Dispatches the "comp" shader under every possible device mask of the device
// group and checks, per participating physical device, that the output equals
// baseOffset[0] + baseOffset[deviceIndex + 1] from the uniform buffer.
tcu::TestStatus DeviceIndexTestInstance::iterate (void)
{
	const DeviceInterface&			vk					= getDeviceInterface();
	const VkDevice					device				= getDevice();
	const VkQueue					queue				= getDeviceQueue(vk, device, m_queueFamilyIndex, 0);
	SimpleAllocator					allocator			(vk, device, getPhysicalDeviceMemoryProperties(m_context.getInstanceInterface(), getPhysicalDevice()));
	const deUint32					allocDeviceMask		= (1 << m_numPhysDevices) - 1;	// all devices of the group
	de::Random						rnd					(0x82ce7f);
	Move<VkBuffer>					sboBuffer;
	vk::Move<vk::VkDeviceMemory>	sboBufferMemory;

	// Create an uniform and output buffer
	// 4 uints per logical array element; presumably matches the 16-byte std140
	// stride of the shader's "uint baseOffset[...]" array — TODO confirm.
	const deUint32 uniformBufSize = 4 * (1 + VK_MAX_DEVICE_GROUP_SIZE);
	const VkDeviceSize uniformBufferSizeBytes = sizeof(deUint32) * uniformBufSize;
	const BufferWithMemory uniformBuffer(vk, device, allocator, makeBufferCreateInfo(uniformBufferSizeBytes, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT), MemoryRequirement::HostVisible);

	// checkBuffer is the host-visible staging target the SBO is copied into for validation.
	const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * m_numValues;
	const BufferWithMemory checkBuffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_TRANSFER_DST_BIT), MemoryRequirement::HostVisible);

	// create SBO buffer
	// Allocated manually (not via the allocator) so the memory can be bound on
	// all devices of the group via VK_MEMORY_ALLOCATE_DEVICE_MASK_BIT.
	{
		const VkBufferCreateInfo	sboBufferParams =
		{
			VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,									// sType
			DE_NULL,																// pNext
			0u,																		// flags
			(VkDeviceSize)bufferSizeBytes,											// size
			VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT,	// usage
			VK_SHARING_MODE_EXCLUSIVE,												// sharingMode
			1u,																		// queueFamilyIndexCount
			&m_queueFamilyIndex,														// pQueueFamilyIndices
		};
		sboBuffer = createBuffer(vk, device, &sboBufferParams);

		// Pick the first device-local memory type compatible with the buffer.
		VkMemoryRequirements memReqs = getBufferMemoryRequirements(vk, device, sboBuffer.get());
		deUint32 memoryTypeNdx = 0;
		const VkPhysicalDeviceMemoryProperties deviceMemProps = getPhysicalDeviceMemoryProperties(m_context.getInstanceInterface(), getPhysicalDevice());
		for ( memoryTypeNdx = 0; memoryTypeNdx < deviceMemProps.memoryTypeCount; memoryTypeNdx++)
		{
			if ((memReqs.memoryTypeBits & (1u << memoryTypeNdx)) != 0 &&
				(deviceMemProps.memoryTypes[memoryTypeNdx].propertyFlags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) == VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT)
				break;
		}
		if (memoryTypeNdx == deviceMemProps.memoryTypeCount)
			TCU_THROW(NotSupportedError, "No compatible memory type found");

		// Allocate the memory on every physical device of the group at once.
		const VkMemoryAllocateFlagsInfo allocDeviceMaskInfo =
		{
			VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO,		// sType
			DE_NULL,											// pNext
			VK_MEMORY_ALLOCATE_DEVICE_MASK_BIT,					// flags
			allocDeviceMask,									// deviceMask
		};

		VkMemoryAllocateInfo		allocInfo =
		{
			VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,			// sType
			&allocDeviceMaskInfo,							// pNext
			memReqs.size,									// allocationSize
			memoryTypeNdx,									// memoryTypeIndex
		};

		sboBufferMemory = allocateMemory(vk, device, &allocInfo);
		VK_CHECK(vk.bindBufferMemory(device, *sboBuffer, sboBufferMemory.get(), 0));
	}

	// Fill the buffers with data
	// uniformInputData keeps a host-side copy of the uniform contents so the
	// expected values can be recomputed during validation.
	typedef std::vector<deUint32> data_vector_t;
	data_vector_t uniformInputData(uniformBufSize, 0);

	{
		const Allocation& bufferAllocation = uniformBuffer.getAllocation();
		deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
		for (deUint32 i = 0; i < uniformBufSize; ++i)
			uniformInputData[i] = *bufferPtr++ = rnd.getUint32() / 10; // divide to prevent overflow in addition

		flushAlloc(vk, device, bufferAllocation);
	}

	// Create descriptor set
	// Binding 0: SBO output buffer; binding 1: uniform input (matches the shader).
	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
		DescriptorSetLayoutBuilder()
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.addSingleBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.build(vk, device));

	const Unique<VkDescriptorPool> descriptorPool(
		DescriptorPoolBuilder()
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
		.addType(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)
		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

	const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*sboBuffer, 0ull, bufferSizeBytes);
	const VkDescriptorBufferInfo uniformBufferDescriptorInfo = makeDescriptorBufferInfo(*uniformBuffer, 0ull, uniformBufferSizeBytes);

	DescriptorSetUpdateBuilder()
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, &uniformBufferDescriptorInfo)
		.update(vk, device);

	const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
	const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
	const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));

	// Barriers reused across all loop iterations below:
	// host write -> shader read on the uniform buffer, and shader write ->
	// transfer read on the SBO (before it is copied to checkBuffer).
	const VkBufferMemoryBarrier hostUniformWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_UNIFORM_READ_BIT, *uniformBuffer, 0ull, uniformBufferSizeBytes);
	const VkBufferMemoryBarrier shaderWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT , *sboBuffer, 0ull, bufferSizeBytes);

	const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, m_queueFamilyIndex));
	const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

	// Verify multiple device masks
	for (deUint32 physDevMask = 1; physDevMask < (1u << m_numPhysDevices); physDevMask++)
	{
		// Re-randomize baseOffset[0] (first uint of the uniform buffer) each
		// iteration so every mask is validated against a fresh expected value.
		deUint32 constantValPerLoop = 0;
		{
			const Allocation& bufferAllocation = uniformBuffer.getAllocation();
			deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
			constantValPerLoop = *bufferPtr = rnd.getUint32() / 10;  // divide to prevent overflow in addition
			flushAlloc(vk, device, bufferAllocation);
		}
		beginCommandBuffer(vk, *cmdBuffer);

		vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
		vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
		vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostUniformWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

		// Restrict the dispatch to the devices selected by physDevMask.
		vk.cmdSetDeviceMask(*cmdBuffer, physDevMask);
		vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());

		vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &shaderWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

		endCommandBuffer(vk, *cmdBuffer);
		submitCommandsAndWait(vk, device, queue, *cmdBuffer, true, physDevMask);
		m_context.resetCommandPoolForVKSC(device, *cmdPool);

		// Validate the results on all physical devices where compute shader was launched
		const VkBufferMemoryBarrier srcBufferBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT , *sboBuffer, 0ull, bufferSizeBytes);
		const VkBufferMemoryBarrier dstBufferBarrier = makeBufferMemoryBarrier(VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *checkBuffer, 0ull, bufferSizeBytes);
		const VkBufferCopy	copyParams =
		{
			(VkDeviceSize)0u,						// srcOffset
			(VkDeviceSize)0u,						// dstOffset
			bufferSizeBytes							// size
		};

		for (deUint32 physDevIdx = 0; physDevIdx < m_numPhysDevices; physDevIdx++)
		{
			// Skip devices that did not participate in this dispatch.
			if (!(1<<physDevIdx & physDevMask))
				continue;

			const deUint32 deviceMask = 1 << physDevIdx;

			// Copy this device's instance of the SBO into the host-visible
			// check buffer, then read it back.
			beginCommandBuffer(vk, *cmdBuffer);
			vk.cmdSetDeviceMask(*cmdBuffer, deviceMask);
			vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT , VK_PIPELINE_STAGE_TRANSFER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &srcBufferBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
			vk.cmdCopyBuffer(*cmdBuffer, *sboBuffer, *checkBuffer, 1, &copyParams);
			vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &dstBufferBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

			endCommandBuffer(vk, *cmdBuffer);
			submitCommandsAndWait(vk, device, queue, *cmdBuffer, true, deviceMask);

			const Allocation& bufferAllocation = checkBuffer.getAllocation();
			invalidateAlloc(vk, device, bufferAllocation);
			const deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());

			for (deUint32 ndx = 0; ndx < m_numValues; ++ndx)
			{
				const deUint32 res = bufferPtr[ndx];
				// Host index 4*(physDevIdx+1) corresponds to the shader's
				// baseOffset[physDevIdx+1] under the 4-uint-per-element layout
				// used when the buffer was filled above.
				const deUint32 ref = constantValPerLoop + uniformInputData[4 * (physDevIdx + 1)];

				if (res != ref)
				{
					std::ostringstream msg;
					msg << "Comparison failed on physical device "<< getPhysicalDevice(physDevIdx) <<" ( deviceMask "<< deviceMask <<" ) for InOut.values[" << ndx << "]";
					return tcu::TestStatus::fail(msg.str());
				}
			}
		}
	}

	return tcu::TestStatus::pass("Compute succeeded");
}
3004 
// Test case that submits the same compute workload to two queues with
// different priorities on a custom device (see ConcurrentComputeInstance).
class ConcurrentCompute : public vkt::TestCase
{
public:
						ConcurrentCompute	(tcu::TestContext&	testCtx,
											 const std::string&	name,
											 const std::string&	description);


	void				initPrograms		(SourceCollections& sourceCollections) const;
	TestInstance*		createInstance		(Context&			context) const;
};
3016 
// Runtime side of ConcurrentCompute; all state is created locally in iterate().
class ConcurrentComputeInstance : public vkt::TestInstance
{
public:
									ConcurrentComputeInstance	(Context& context);

	tcu::TestStatus					iterate						(void);
};
3024 
// Plain pass-through to TestCase; the case has no parameters of its own.
ConcurrentCompute::ConcurrentCompute (tcu::TestContext&	testCtx,
									  const std::string&	name,
									  const std::string&	description)
	: TestCase		(testCtx, name, description)
{
}
3031 
initPrograms(SourceCollections & sourceCollections) const3032 void ConcurrentCompute::initPrograms (SourceCollections& sourceCollections) const
3033 {
3034 	std::ostringstream src;
3035 	src << "#version 310 es\n"
3036 		<< "layout (local_size_x = 1, local_size_y = 1, local_size_z = 1) in;\n"
3037 		<< "layout(binding = 0) buffer InOut {\n"
3038 		<< "    uint values[1024];\n"
3039 		<< "} sb_inout;\n"
3040 		<< "void main (void) {\n"
3041 		<< "    uvec3 size           = gl_NumWorkGroups * gl_WorkGroupSize;\n"
3042 		<< "    uint numValuesPerInv = uint(sb_inout.values.length()) / (size.x*size.y*size.z);\n"
3043 		<< "    uint groupNdx        = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
3044 		<< "    uint offset          = numValuesPerInv*groupNdx;\n"
3045 		<< "\n"
3046 		<< "    for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
3047 		<< "        sb_inout.values[offset + ndx] = ~sb_inout.values[offset + ndx];\n"
3048 		<< "}\n";
3049 
3050 	sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
3051 }
3052 
createInstance(Context & context) const3053 TestInstance* ConcurrentCompute::createInstance (Context& context) const
3054 {
3055 	return new ConcurrentComputeInstance(context);
3056 }
3057 
// Nothing to set up here; devices, queues and buffers are all created inside
// iterate().
ConcurrentComputeInstance::ConcurrentComputeInstance (Context& context)
	: TestInstance	(context)
{
}
3062 
iterate(void)3063 tcu::TestStatus ConcurrentComputeInstance::iterate (void)
3064 {
3065 	enum {
3066 		NO_MATCH_FOUND	= ~((deUint32)0),
3067 		ERROR_NONE		= 0,
3068 		ERROR_WAIT		= 1,
3069 		ERROR_ORDER		= 2
3070 	};
3071 
3072 	struct Queues
3073 	{
3074 		VkQueue		queue;
3075 		deUint32	queueFamilyIndex;
3076 	};
3077 
3078 //	const DeviceInterface&					vk							= m_context.getDeviceInterface();
3079 	const deUint32							numValues					= 1024;
3080 	const CustomInstance					instance					(createCustomInstanceFromContext(m_context));
3081 	const InstanceDriver&					instanceDriver				(instance.getDriver());
3082 	const VkPhysicalDevice					physicalDevice				= chooseDevice(instanceDriver, instance, m_context.getTestContext().getCommandLine());
3083 	tcu::TestLog&							log							= m_context.getTestContext().getLog();
3084 	vk::Move<vk::VkDevice>					logicalDevice;
3085 	std::vector<VkQueueFamilyProperties>	queueFamilyProperties;
3086 	VkDeviceCreateInfo						deviceInfo;
3087 	VkPhysicalDeviceFeatures				deviceFeatures;
3088 	const float								queuePriorities[2]			= {1.0f, 0.0f};
3089 	VkDeviceQueueCreateInfo					queueInfos[2];
3090 	Queues									queues[2]					=
3091 																		{
3092 																			{DE_NULL, (deUint32)NO_MATCH_FOUND},
3093 																			{DE_NULL, (deUint32)NO_MATCH_FOUND}
3094 																		};
3095 
3096 	queueFamilyProperties = getPhysicalDeviceQueueFamilyProperties(instanceDriver, physicalDevice);
3097 
3098 	for (deUint32 queueNdx = 0; queueNdx < queueFamilyProperties.size(); ++queueNdx)
3099 	{
3100 		if (queueFamilyProperties[queueNdx].queueFlags & VK_QUEUE_COMPUTE_BIT)
3101 		{
3102 			if (NO_MATCH_FOUND == queues[0].queueFamilyIndex)
3103 				queues[0].queueFamilyIndex = queueNdx;
3104 
3105 			if (queues[0].queueFamilyIndex != queueNdx || queueFamilyProperties[queueNdx].queueCount > 1u)
3106 			{
3107 				queues[1].queueFamilyIndex = queueNdx;
3108 				break;
3109 			}
3110 		}
3111 	}
3112 
3113 	if (queues[0].queueFamilyIndex == NO_MATCH_FOUND || queues[1].queueFamilyIndex == NO_MATCH_FOUND)
3114 		TCU_THROW(NotSupportedError, "Queues couldn't be created");
3115 
3116 	for (int queueNdx = 0; queueNdx < 2; ++queueNdx)
3117 	{
3118 		VkDeviceQueueCreateInfo queueInfo;
3119 		deMemset(&queueInfo, 0, sizeof(queueInfo));
3120 
3121 		queueInfo.sType				= VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
3122 		queueInfo.pNext				= DE_NULL;
3123 		queueInfo.flags				= (VkDeviceQueueCreateFlags)0u;
3124 		queueInfo.queueFamilyIndex	= queues[queueNdx].queueFamilyIndex;
3125 		queueInfo.queueCount		= (queues[0].queueFamilyIndex == queues[1].queueFamilyIndex) ? 2 : 1;
3126 		queueInfo.pQueuePriorities	= (queueInfo.queueCount == 2) ? queuePriorities : &queuePriorities[queueNdx];
3127 
3128 		queueInfos[queueNdx]		= queueInfo;
3129 
3130 		if (queues[0].queueFamilyIndex == queues[1].queueFamilyIndex)
3131 			break;
3132 	}
3133 
3134 	void* pNext												= DE_NULL;
3135 #ifdef CTS_USES_VULKANSC
3136 	VkDeviceObjectReservationCreateInfo memReservationInfo	= m_context.getTestContext().getCommandLine().isSubProcess() ? m_context.getResourceInterface()->getStatMax() : resetDeviceObjectReservationCreateInfo();
3137 	memReservationInfo.pNext								= pNext;
3138 	pNext													= &memReservationInfo;
3139 
3140 	VkPhysicalDeviceVulkanSC10Features sc10Features			= createDefaultSC10Features();
3141 	sc10Features.pNext										= pNext;
3142 	pNext													= &sc10Features;
3143 
3144 	VkPipelineCacheCreateInfo			pcCI;
3145 	std::vector<VkPipelinePoolSize>		poolSizes;
3146 	if (m_context.getTestContext().getCommandLine().isSubProcess())
3147 	{
3148 		if (m_context.getResourceInterface()->getCacheDataSize() > 0)
3149 		{
3150 			pcCI =
3151 			{
3152 				VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO,			// VkStructureType				sType;
3153 				DE_NULL,												// const void*					pNext;
3154 				VK_PIPELINE_CACHE_CREATE_READ_ONLY_BIT |
3155 					VK_PIPELINE_CACHE_CREATE_USE_APPLICATION_STORAGE_BIT,	// VkPipelineCacheCreateFlags	flags;
3156 				m_context.getResourceInterface()->getCacheDataSize(),	// deUintptr					initialDataSize;
3157 				m_context.getResourceInterface()->getCacheData()		// const void*					pInitialData;
3158 			};
3159 			memReservationInfo.pipelineCacheCreateInfoCount		= 1;
3160 			memReservationInfo.pPipelineCacheCreateInfos		= &pcCI;
3161 		}
3162 
3163 		poolSizes							= m_context.getResourceInterface()->getPipelinePoolSizes();
3164 		if (!poolSizes.empty())
3165 		{
3166 			memReservationInfo.pipelinePoolSizeCount			= deUint32(poolSizes.size());
3167 			memReservationInfo.pPipelinePoolSizes				= poolSizes.data();
3168 		}
3169 	}
3170 #endif // CTS_USES_VULKANSC
3171 
3172 	deMemset(&deviceInfo, 0, sizeof(deviceInfo));
3173 	instanceDriver.getPhysicalDeviceFeatures(physicalDevice, &deviceFeatures);
3174 
3175 	deviceInfo.sType					= VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
3176 	deviceInfo.pNext					= pNext;
3177 	deviceInfo.enabledExtensionCount	= 0u;
3178 	deviceInfo.ppEnabledExtensionNames	= DE_NULL;
3179 	deviceInfo.enabledLayerCount		= 0u;
3180 	deviceInfo.ppEnabledLayerNames		= DE_NULL;
3181 	deviceInfo.pEnabledFeatures			= &deviceFeatures;
3182 	deviceInfo.queueCreateInfoCount		= (queues[0].queueFamilyIndex == queues[1].queueFamilyIndex) ? 1 : 2;
3183 	deviceInfo.pQueueCreateInfos		= queueInfos;
3184 
3185 	logicalDevice = createCustomDevice	(m_context.getTestContext().getCommandLine().isValidationEnabled(), m_context.getPlatformInterface(), instance, instanceDriver, physicalDevice, &deviceInfo);
3186 
3187 #ifndef CTS_USES_VULKANSC
3188 	de::MovePtr<vk::DeviceDriver>	deviceDriver = de::MovePtr<DeviceDriver>(new DeviceDriver(m_context.getPlatformInterface(), instance, *logicalDevice));
3189 #else
3190 	de::MovePtr<vk::DeviceDriverSC, vk::DeinitDeviceDeleter>	deviceDriver = de::MovePtr<DeviceDriverSC, DeinitDeviceDeleter>(new DeviceDriverSC(m_context.getPlatformInterface(), instance, *logicalDevice, m_context.getTestContext().getCommandLine(), m_context.getResourceInterface(), m_context.getDeviceVulkanSC10Properties(), m_context.getDeviceProperties()), vk::DeinitDeviceDeleter(m_context.getResourceInterface().get(), *logicalDevice));
3191 #endif // CTS_USES_VULKANSC
3192 	vk::DeviceInterface& vk = *deviceDriver;
3193 
3194 	for (deUint32 queueReqNdx = 0; queueReqNdx < 2; ++queueReqNdx)
3195 	{
3196 		if (queues[0].queueFamilyIndex == queues[1].queueFamilyIndex)
3197 			vk.getDeviceQueue(*logicalDevice, queues[queueReqNdx].queueFamilyIndex, queueReqNdx, &queues[queueReqNdx].queue);
3198 		else
3199 			vk.getDeviceQueue(*logicalDevice, queues[queueReqNdx].queueFamilyIndex, 0u, &queues[queueReqNdx].queue);
3200 	}
3201 
3202 	// Create an input/output buffers
3203 	const VkPhysicalDeviceMemoryProperties memoryProperties	= vk::getPhysicalDeviceMemoryProperties(instanceDriver, physicalDevice);
3204 
3205 	de::MovePtr<SimpleAllocator> allocator					= de::MovePtr<SimpleAllocator>(new SimpleAllocator(vk, *logicalDevice, memoryProperties));
3206 	const VkDeviceSize bufferSizeBytes						= sizeof(deUint32) * numValues;
3207 	const BufferWithMemory buffer1(vk, *logicalDevice, *allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
3208 	const BufferWithMemory buffer2(vk, *logicalDevice, *allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
3209 
3210 	// Fill the buffers with data
3211 
3212 	typedef std::vector<deUint32> data_vector_t;
3213 	data_vector_t inputData(numValues);
3214 
3215 	{
3216 		de::Random rnd(0x82ce7f);
3217 		const Allocation& bufferAllocation1	= buffer1.getAllocation();
3218 		const Allocation& bufferAllocation2	= buffer2.getAllocation();
3219 		deUint32* bufferPtr1				= static_cast<deUint32*>(bufferAllocation1.getHostPtr());
3220 		deUint32* bufferPtr2				= static_cast<deUint32*>(bufferAllocation2.getHostPtr());
3221 
3222 		for (deUint32 i = 0; i < numValues; ++i)
3223 		{
3224 			deUint32 val = rnd.getUint32();
3225 			inputData[i] = val;
3226 			*bufferPtr1++ = val;
3227 			*bufferPtr2++ = val;
3228 		}
3229 
3230 		flushAlloc(vk, *logicalDevice, bufferAllocation1);
3231 		flushAlloc(vk, *logicalDevice, bufferAllocation2);
3232 	}
3233 
3234 	// Create descriptor sets
3235 
3236 	const Unique<VkDescriptorSetLayout>	descriptorSetLayout1(
3237 		DescriptorSetLayoutBuilder()
3238 		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
3239 		.build(vk, *logicalDevice));
3240 
3241 	const Unique<VkDescriptorPool>		descriptorPool1(
3242 		DescriptorPoolBuilder()
3243 		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
3244 		.build(vk, *logicalDevice, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
3245 
3246 	const Unique<VkDescriptorSet>		descriptorSet1(makeDescriptorSet(vk, *logicalDevice, *descriptorPool1, *descriptorSetLayout1));
3247 
3248 	const VkDescriptorBufferInfo		bufferDescriptorInfo1	= makeDescriptorBufferInfo(*buffer1, 0ull, bufferSizeBytes);
3249 		DescriptorSetUpdateBuilder()
3250 		.writeSingle(*descriptorSet1, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo1)
3251 		.update(vk, *logicalDevice);
3252 
3253 	const Unique<VkDescriptorSetLayout>	descriptorSetLayout2(
3254 		DescriptorSetLayoutBuilder()
3255 		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
3256 		.build(vk, *logicalDevice));
3257 
3258 	const Unique<VkDescriptorPool>		descriptorPool2(
3259 		DescriptorPoolBuilder()
3260 		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
3261 		.build(vk, *logicalDevice, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
3262 
3263 	const Unique<VkDescriptorSet>		descriptorSet2(makeDescriptorSet(vk, *logicalDevice, *descriptorPool2, *descriptorSetLayout2));
3264 
3265 	const VkDescriptorBufferInfo		bufferDescriptorInfo2	= makeDescriptorBufferInfo(*buffer2, 0ull, bufferSizeBytes);
3266 		DescriptorSetUpdateBuilder()
3267 		.writeSingle(*descriptorSet2, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo2)
3268 		.update(vk, *logicalDevice);
3269 
3270 	// Perform the computation
3271 
3272 	const Unique<VkShaderModule>		shaderModule(createShaderModule(vk, *logicalDevice, m_context.getBinaryCollection().get("comp"), 0u));
3273 
3274 	const Unique<VkPipelineLayout>		pipelineLayout1(makePipelineLayout(vk, *logicalDevice, *descriptorSetLayout1));
3275 	const Unique<VkPipeline>			pipeline1(makeComputePipeline(vk, *logicalDevice, *pipelineLayout1, *shaderModule));
3276 	const VkBufferMemoryBarrier			hostWriteBarrier1		= makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *buffer1, 0ull, bufferSizeBytes);
3277 	const VkBufferMemoryBarrier			shaderWriteBarrier1		= makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer1, 0ull, bufferSizeBytes);
3278 	const Unique<VkCommandPool>			cmdPool1(makeCommandPool(vk, *logicalDevice, queues[0].queueFamilyIndex));
3279 	const Unique<VkCommandBuffer>		cmdBuffer1(allocateCommandBuffer(vk, *logicalDevice, *cmdPool1, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
3280 
3281 	const Unique<VkPipelineLayout>		pipelineLayout2(makePipelineLayout(vk, *logicalDevice, *descriptorSetLayout2));
3282 	const Unique<VkPipeline>			pipeline2(makeComputePipeline(vk, *logicalDevice, *pipelineLayout2, *shaderModule));
3283 	const VkBufferMemoryBarrier			hostWriteBarrier2		= makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *buffer2, 0ull, bufferSizeBytes);
3284 	const VkBufferMemoryBarrier			shaderWriteBarrier2		= makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer2, 0ull, bufferSizeBytes);
3285 	const Unique<VkCommandPool>			cmdPool2(makeCommandPool(vk, *logicalDevice, queues[1].queueFamilyIndex));
3286 	const Unique<VkCommandBuffer>		cmdBuffer2(allocateCommandBuffer(vk, *logicalDevice, *cmdPool2, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
3287 
3288 	// Command buffer 1
3289 
3290 	beginCommandBuffer(vk, *cmdBuffer1);
3291 	vk.cmdBindPipeline(*cmdBuffer1, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline1);
3292 	vk.cmdBindDescriptorSets(*cmdBuffer1, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout1, 0u, 1u, &descriptorSet1.get(), 0u, DE_NULL);
3293 	vk.cmdPipelineBarrier(*cmdBuffer1, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostWriteBarrier1, 0, (const VkImageMemoryBarrier*)DE_NULL);
3294 	vk.cmdDispatch(*cmdBuffer1, 1, 1, 1);
3295 	vk.cmdPipelineBarrier(*cmdBuffer1, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &shaderWriteBarrier1, 0, (const VkImageMemoryBarrier*)DE_NULL);
3296 	endCommandBuffer(vk, *cmdBuffer1);
3297 
3298 	// Command buffer 2
3299 
3300 	beginCommandBuffer(vk, *cmdBuffer2);
3301 	vk.cmdBindPipeline(*cmdBuffer2, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline2);
3302 	vk.cmdBindDescriptorSets(*cmdBuffer2, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout2, 0u, 1u, &descriptorSet2.get(), 0u, DE_NULL);
3303 	vk.cmdPipelineBarrier(*cmdBuffer2, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostWriteBarrier2, 0, (const VkImageMemoryBarrier*)DE_NULL);
3304 	vk.cmdDispatch(*cmdBuffer2, 1, 1, 1);
3305 	vk.cmdPipelineBarrier(*cmdBuffer2, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &shaderWriteBarrier2, 0, (const VkImageMemoryBarrier*)DE_NULL);
3306 	endCommandBuffer(vk, *cmdBuffer2);
3307 
3308 	VkSubmitInfo	submitInfo1 =
3309 	{
3310 		VK_STRUCTURE_TYPE_SUBMIT_INFO,			// sType
3311 		DE_NULL,								// pNext
3312 		0u,										// waitSemaphoreCount
3313 		DE_NULL,								// pWaitSemaphores
3314 		(const VkPipelineStageFlags*)DE_NULL,	// pWaitDstStageMask
3315 		1u,										// commandBufferCount
3316 		&cmdBuffer1.get(),						// pCommandBuffers
3317 		0u,										// signalSemaphoreCount
3318 		DE_NULL									// pSignalSemaphores
3319 	};
3320 
3321 	VkSubmitInfo	submitInfo2 =
3322 	{
3323 		VK_STRUCTURE_TYPE_SUBMIT_INFO,			// sType
3324 		DE_NULL,								// pNext
3325 		0u,										// waitSemaphoreCount
3326 		DE_NULL,								// pWaitSemaphores
3327 		(const VkPipelineStageFlags*)DE_NULL,	// pWaitDstStageMask
3328 		1u,										// commandBufferCount
3329 		&cmdBuffer2.get(),						// pCommandBuffers
3330 		0u,										// signalSemaphoreCount
3331 		DE_NULL									// pSignalSemaphores
3332 	};
3333 
3334 	// Wait for completion
3335 	const Unique<VkFence>	fence1(createFence(vk, *logicalDevice));
3336 	const Unique<VkFence>	fence2(createFence(vk, *logicalDevice));
3337 
3338 	VK_CHECK(vk.queueSubmit(queues[0].queue, 1u, &submitInfo1, *fence1));
3339 	VK_CHECK(vk.queueSubmit(queues[1].queue, 1u, &submitInfo2, *fence2));
3340 
3341 	int err = ERROR_NONE;
3342 
3343 	// First wait for the low-priority queue
3344 	if (VK_SUCCESS != vk.waitForFences(*logicalDevice, 1u, &fence2.get(), DE_TRUE, ~0ull))
3345 		err = ERROR_WAIT;
3346 
3347 	// If the high-priority queue hasn't finished, we have a problem.
3348 	if (VK_SUCCESS != vk.getFenceStatus(*logicalDevice, fence1.get()))
3349 		if (err == ERROR_NONE)
3350 			err = ERROR_ORDER;
3351 
3352 	// Wait for the high-priority fence so we don't get errors on teardown.
3353 	vk.waitForFences(*logicalDevice, 1u, &fence1.get(), DE_TRUE, ~0ull);
3354 
3355 	// If we fail() before waiting for all of the fences, error will come from
3356 	// teardown instead of the error we want.
3357 
3358 	if (err == ERROR_WAIT)
3359 	{
3360 		return tcu::TestStatus::fail("Failed waiting for low-priority queue fence.");
3361 	}
3362 
3363 	// Validate the results
3364 
3365 	const Allocation& bufferAllocation1	= buffer1.getAllocation();
3366 	invalidateAlloc(vk, *logicalDevice, bufferAllocation1);
3367 	const deUint32* bufferPtr1			= static_cast<deUint32*>(bufferAllocation1.getHostPtr());
3368 
3369 	const Allocation& bufferAllocation2	= buffer2.getAllocation();
3370 	invalidateAlloc(vk, *logicalDevice, bufferAllocation2);
3371 	const deUint32* bufferPtr2			= static_cast<deUint32*>(bufferAllocation2.getHostPtr());
3372 
3373 	for (deUint32 ndx = 0; ndx < numValues; ++ndx)
3374 	{
3375 		const deUint32 res1	= bufferPtr1[ndx];
3376 		const deUint32 res2	= bufferPtr2[ndx];
3377 		const deUint32 inp	= inputData[ndx];
3378 		const deUint32 ref	= ~inp;
3379 
3380 		if (res1 != ref || res1 != res2)
3381 		{
3382 			std::ostringstream msg;
3383 			msg << "Comparison failed for InOut.values[" << ndx << "] ref:" << ref <<" res1:" << res1 << " res2:" << res2 << " inp:" << inp;
3384 			return tcu::TestStatus::fail(msg.str());
3385 		}
3386 	}
3387 
3388 	if (err == ERROR_ORDER)
3389 		log << tcu::TestLog::Message << "Note: Low-priority queue was faster than high-priority one. This is not an error, but priorities may be inverted." << tcu::TestLog::EndMessage;
3390 
3391 	return tcu::TestStatus::pass("Test passed");
3392 }
3393 
// Test case exercising vkCmdDispatch with a dispatch size that has at least
// one zero component, which must launch no workgroups at all.
class EmptyWorkGroupCase : public vkt::TestCase
{
public:
					EmptyWorkGroupCase		(tcu::TestContext& testCtx, const std::string& name, const std::string& description, const tcu::UVec3& dispatchSize);
	virtual			~EmptyWorkGroupCase		(void) {}

	TestInstance*	createInstance			(Context& context) const override;
	void			initPrograms			(vk::SourceCollections& programCollection) const override;

protected:
	// Dispatch size for the "empty" dispatch; at least one component must be
	// zero (asserted in the constructor).
	const tcu::UVec3 m_dispatchSize;
};
3406 
// Instance for EmptyWorkGroupCase: records the (partially zero) dispatch
// followed by a 1x1x1 dispatch and checks the counter was incremented exactly
// once, i.e. the empty dispatch ran no invocations.
class EmptyWorkGroupInstance : public vkt::TestInstance
{
public:
						EmptyWorkGroupInstance	(Context& context, const tcu::UVec3& dispatchSize)
							: vkt::TestInstance	(context)
							, m_dispatchSize	(dispatchSize)
							{}
	virtual				~EmptyWorkGroupInstance	(void) {}

	tcu::TestStatus		iterate					(void) override;

protected:
	// Dispatch size with at least one zero component (see EmptyWorkGroupCase).
	const tcu::UVec3 m_dispatchSize;
};
3421 
EmptyWorkGroupCase::EmptyWorkGroupCase (tcu::TestContext& testCtx, const std::string& name, const std::string& description, const tcu::UVec3& dispatchSize)
	: vkt::TestCase		(testCtx, name, description)
	, m_dispatchSize	(dispatchSize)
{
	// The test only makes sense when at least one dispatch dimension is zero;
	// otherwise the "empty" dispatch would actually execute workgroups.
	DE_ASSERT(m_dispatchSize.x() == 0u || m_dispatchSize.y() == 0u || m_dispatchSize.z() == 0u);
}
3428 
createInstance(Context & context) const3429 TestInstance* EmptyWorkGroupCase::createInstance (Context& context) const
3430 {
3431 	return new EmptyWorkGroupInstance(context, m_dispatchSize);
3432 }
3433 
initPrograms(vk::SourceCollections & programCollection) const3434 void EmptyWorkGroupCase::initPrograms (vk::SourceCollections& programCollection) const
3435 {
3436 	std::ostringstream comp;
3437 	comp
3438 		<< "#version 450\n"
3439 		<< "layout (local_size_x=1, local_size_y=1, local_size_z=1) in;\n"
3440 		<< "layout (set=0, binding=0) buffer VerificationBlock { uint value; } verif;\n"
3441 		<< "void main () { atomicAdd(verif.value, 1u); }\n"
3442 		;
3443 	programCollection.glslSources.add("comp") << glu::ComputeSource(comp.str());
3444 }
3445 
// Runs the empty dispatch followed by a 1x1x1 dispatch; passes iff the
// counter buffer ends up containing exactly 1 (i.e. only the second dispatch
// executed any invocations).
tcu::TestStatus EmptyWorkGroupInstance::iterate (void)
{
	const auto&		vkd				= m_context.getDeviceInterface();
	const auto		device			= m_context.getDevice();
	auto&			alloc			= m_context.getDefaultAllocator();
	const auto		queueIndex		= m_context.getUniversalQueueFamilyIndex();
	const auto		queue			= m_context.getUniversalQueue();

	// Host-visible buffer holding the single uint32 counter written by the shader.
	const auto			verifBufferSize		= static_cast<VkDeviceSize>(sizeof(uint32_t));
	const auto			verifBufferInfo		= makeBufferCreateInfo(verifBufferSize, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT);
	BufferWithMemory	verifBuffer			(vkd, device, alloc, verifBufferInfo, MemoryRequirement::HostVisible);
	auto&				verifBufferAlloc	= verifBuffer.getAllocation();
	void*				verifBufferPtr		= verifBufferAlloc.getHostPtr();

	// Zero the counter and make the write visible to the device.
	deMemset(verifBufferPtr, 0, static_cast<size_t>(verifBufferSize));
	flushAlloc(vkd, device, verifBufferAlloc);

	// Single storage-buffer binding for the counter.
	DescriptorSetLayoutBuilder layoutBuilder;
	layoutBuilder.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT);
	const auto descriptorSetLayout = layoutBuilder.build(vkd, device);

	const auto pipelineLayout	= makePipelineLayout(vkd, device, descriptorSetLayout.get());
	const auto shaderModule		= createShaderModule(vkd, device, m_context.getBinaryCollection().get("comp"), 0u);
	const auto pipeline			= makeComputePipeline(vkd, device, pipelineLayout.get(), shaderModule.get());

	DescriptorPoolBuilder poolBuilder;
	poolBuilder.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
	const auto descriptorPool	= poolBuilder.build(vkd, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u);
	const auto descriptorSet	= makeDescriptorSet(vkd, device, descriptorPool.get(), descriptorSetLayout.get());

	DescriptorSetUpdateBuilder updateBuilder;
	const auto verifBufferDescInfo = makeDescriptorBufferInfo(verifBuffer.get(), 0ull, verifBufferSize);
	updateBuilder.writeSingle(descriptorSet.get(), DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &verifBufferDescInfo);
	updateBuilder.update(vkd, device);

	const auto cmdPool = makeCommandPool(vkd, device, queueIndex);
	const auto cmdBufferPtr = allocateCommandBuffer(vkd, device, cmdPool.get(), VK_COMMAND_BUFFER_LEVEL_PRIMARY);
	const auto cmdBuffer = cmdBufferPtr.get();

	beginCommandBuffer(vkd, cmdBuffer);
	vkd.cmdBindPipeline(cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline.get());
	vkd.cmdBindDescriptorSets(cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipelineLayout.get(), 0u, 1u, &descriptorSet.get(), 0u, nullptr);
	// The "empty" dispatch: at least one dimension is zero, so no workgroups run.
	vkd.cmdDispatch(cmdBuffer, m_dispatchSize.x(), m_dispatchSize.y(), m_dispatchSize.z());

	// Order the two dispatches' accesses to the counter buffer.
	const auto readWriteAccess	= (VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_SHADER_READ_BIT);
	const auto computeToCompute = makeMemoryBarrier(readWriteAccess, readWriteAccess);
	vkd.cmdPipelineBarrier(cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0U, 1u, &computeToCompute, 0u, nullptr, 0u, nullptr);

	// Reference dispatch: exactly one invocation, so the counter must become 1.
	vkd.cmdDispatch(cmdBuffer, 1u, 1u, 1u);

	// Make the shader write visible to the host read below.
	const auto computeToHost = makeMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT);
	vkd.cmdPipelineBarrier(cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, 0u, 1u, &computeToHost, 0u, nullptr, 0u, nullptr);

	endCommandBuffer(vkd, cmdBuffer);
	submitCommandsAndWait(vkd, device, queue, cmdBuffer);

	// Read back and verify the counter.
	uint32_t value;
	invalidateAlloc(vkd, device, verifBufferAlloc);
	deMemcpy(&value, verifBufferPtr, sizeof(value));

	if (value != 1u)
	{
		std::ostringstream msg;
		msg << "Unexpected value found in buffer: " << value << " while expecting 1";
		TCU_FAIL(msg.str());
	}

	return tcu::TestStatus::pass("Pass");
}
3515 
// Verifies a compute dispatch works when one workgroup axis is set to the
// device's maxComputeWorkGroupSize limit for that axis (via spec constants).
class MaxWorkGroupSizeTest : public vkt::TestCase
{
public:
	enum class Axis	{ X = 0, Y = 1, Z = 2 };

	struct Params
	{
		// Which axis to maximize.
		Axis axis;
	};

							MaxWorkGroupSizeTest	(tcu::TestContext& testCtx, const std::string& name, const std::string& description, const Params& params);
	virtual					~MaxWorkGroupSizeTest	(void) {}

	virtual void			initPrograms			(vk::SourceCollections& programCollection) const;
	virtual TestInstance*	createInstance			(Context& context) const;
	virtual void			checkSupport			(Context& context) const;

	// Helper to transform the axis value to an index.
	static int				getIndex				(Axis axis);

	// Helper returning the number of invocations according to the test parameters.
	// If devProperties is null, the limits are queried from the physical device.
	static deUint32			getInvocations			(const Params& params, const vk::InstanceInterface& vki, vk::VkPhysicalDevice physicalDevice, const vk::VkPhysicalDeviceProperties* devProperties = nullptr);

	// Helper returning the buffer size needed to this test.
	static deUint32			getSSBOSize				(deUint32 invocations);

private:
	Params m_params;
};
3546 
// Instance for MaxWorkGroupSizeTest: dispatches a single workgroup maximized
// along the selected axis and verifies every invocation wrote its SSBO slot.
class MaxWorkGroupSizeInstance : public vkt::TestInstance
{
public:
								MaxWorkGroupSizeInstance	(Context& context, const MaxWorkGroupSizeTest::Params& params);
	virtual						~MaxWorkGroupSizeInstance	(void) {}

	virtual tcu::TestStatus		iterate			(void);

private:
	// Copy of the test parameters (axis to maximize).
	MaxWorkGroupSizeTest::Params m_params;
};
3558 
getIndex(Axis axis)3559 int MaxWorkGroupSizeTest::getIndex (Axis axis)
3560 {
3561 	const int ret = static_cast<int>(axis);
3562 	DE_ASSERT(ret >= static_cast<int>(Axis::X) && ret <= static_cast<int>(Axis::Z));
3563 	return ret;
3564 }
3565 
getInvocations(const Params & params,const vk::InstanceInterface & vki,vk::VkPhysicalDevice physicalDevice,const vk::VkPhysicalDeviceProperties * devProperties)3566 deUint32 MaxWorkGroupSizeTest::getInvocations (const Params& params, const vk::InstanceInterface& vki, vk::VkPhysicalDevice physicalDevice, const vk::VkPhysicalDeviceProperties* devProperties)
3567 {
3568 	const auto axis = getIndex(params.axis);
3569 
3570 	if (devProperties)
3571 		return devProperties->limits.maxComputeWorkGroupSize[axis];
3572 	return vk::getPhysicalDeviceProperties(vki, physicalDevice).limits.maxComputeWorkGroupSize[axis];
3573 }
3574 
getSSBOSize(deUint32 invocations)3575 deUint32 MaxWorkGroupSizeTest::getSSBOSize (deUint32 invocations)
3576 {
3577 	return invocations * static_cast<deUint32>(sizeof(deUint32));
3578 }
3579 
// Stores the test parameters; all real work happens in checkSupport,
// initPrograms and the instance's iterate.
MaxWorkGroupSizeTest::MaxWorkGroupSizeTest (tcu::TestContext& testCtx, const std::string& name, const std::string& description, const Params& params)
	: vkt::TestCase	(testCtx, name, description)
	, m_params		(params)
{}
3584 
initPrograms(vk::SourceCollections & programCollection) const3585 void MaxWorkGroupSizeTest::initPrograms (vk::SourceCollections& programCollection) const
3586 {
3587 	std::ostringstream shader;
3588 
3589 	// The actual local sizes will be set using spec constants when running the test instance.
3590 	shader
3591 		<< "#version 450\n"
3592 		<< "\n"
3593 		<< "layout(constant_id=0) const int local_size_x_val = 1;\n"
3594 		<< "layout(constant_id=1) const int local_size_y_val = 1;\n"
3595 		<< "layout(constant_id=2) const int local_size_z_val = 1;\n"
3596 		<< "\n"
3597 		<< "layout(local_size_x_id=0, local_size_y_id=1, local_size_z_id=2) in;\n"
3598 		<< "\n"
3599 		<< "layout(set=0, binding=0) buffer StorageBuffer {\n"
3600 		<< "    uint values[];\n"
3601 		<< "} ssbo;\n"
3602 		<< "\n"
3603 		<< "void main() {\n"
3604 		<< "    ssbo.values[gl_LocalInvocationIndex] = 1u;\n"
3605 		<< "}\n"
3606 		;
3607 
3608 	programCollection.glslSources.add("comp") << glu::ComputeSource(shader.str());
3609 }
3610 
createInstance(Context & context) const3611 TestInstance* MaxWorkGroupSizeTest::createInstance (Context& context) const
3612 {
3613 	return new MaxWorkGroupSizeInstance(context, m_params);
3614 }
3615 
checkSupport(Context & context) const3616 void MaxWorkGroupSizeTest::checkSupport (Context& context) const
3617 {
3618 	const auto&	vki				= context.getInstanceInterface();
3619 	const auto	physicalDevice	= context.getPhysicalDevice();
3620 
3621 	const auto	properties		= vk::getPhysicalDeviceProperties(vki, physicalDevice);
3622 	const auto	invocations		= getInvocations(m_params, vki, physicalDevice, &properties);
3623 
3624 	if (invocations > properties.limits.maxComputeWorkGroupInvocations)
3625 		TCU_FAIL("Reported workgroup size limit in the axis is greater than the global invocation limit");
3626 
3627 	if (properties.limits.maxStorageBufferRange / static_cast<deUint32>(sizeof(deUint32)) < invocations)
3628 		TCU_THROW(NotSupportedError, "Maximum supported storage buffer range too small");
3629 }
3630 
// Stores the test parameters for use in iterate().
MaxWorkGroupSizeInstance::MaxWorkGroupSizeInstance (Context& context, const MaxWorkGroupSizeTest::Params& params)
	: vkt::TestInstance	(context)
	, m_params			(params)
{}
3635 
// Dispatches a single workgroup whose size is maximized along the chosen axis
// (set through specialization constants) and verifies that every invocation
// wrote a 1 into its slot of the SSBO.
tcu::TestStatus MaxWorkGroupSizeInstance::iterate (void)
{
	const auto&	vki				= m_context.getInstanceInterface();
	const auto&	vkd				= m_context.getDeviceInterface();
	const auto	physicalDevice	= m_context.getPhysicalDevice();
	const auto	device			= m_context.getDevice();
	auto&		alloc			= m_context.getDefaultAllocator();
	const auto	queueIndex		= m_context.getUniversalQueueFamilyIndex();
	const auto	queue			= m_context.getUniversalQueue();
	auto&		log				= m_context.getTestContext().getLog();

	// Number of invocations equals the device limit on the selected axis.
	const auto	axis			= MaxWorkGroupSizeTest::getIndex(m_params.axis);
	const auto	invocations		= MaxWorkGroupSizeTest::getInvocations(m_params, vki, physicalDevice);
	const auto	ssboSize		= static_cast<vk::VkDeviceSize>(MaxWorkGroupSizeTest::getSSBOSize(invocations));

	log
		<< tcu::TestLog::Message
		<< "Running test with " << invocations << " invocations on axis " << axis << " using a storage buffer size of " << ssboSize << " bytes"
		<< tcu::TestLog::EndMessage
		;

	// Main SSBO buffer (host-visible, one deUint32 flag per invocation).
	const auto				ssboInfo	= vk::makeBufferCreateInfo(ssboSize, vk::VK_BUFFER_USAGE_STORAGE_BUFFER_BIT);
	vk::BufferWithMemory	ssbo		(vkd, device, alloc, ssboInfo, vk::MemoryRequirement::HostVisible);

	// Shader module.
	const auto shaderModule	= vk::createShaderModule(vkd, device, m_context.getBinaryCollection().get("comp"), 0u);

	// Descriptor set layouts.
	vk::DescriptorSetLayoutBuilder layoutBuilder;
	layoutBuilder.addSingleBinding(vk::VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, vk::VK_SHADER_STAGE_COMPUTE_BIT);
	const auto descriptorSetLayout = layoutBuilder.build(vkd, device);

	// Specialization constants: set the number of invocations in the appropriate local size id.
	// The other two axes keep their default local size of 1.
	const auto	entrySize				= static_cast<deUintptr>(sizeof(deInt32));
	deInt32		specializationData[3]	= { 1, 1, 1 };
	specializationData[axis] = static_cast<deInt32>(invocations);

	// One map entry per spec constant id 0..2, laid out contiguously.
	const vk::VkSpecializationMapEntry specializationMaps[3] =
	{
		{
			0u,										//	deUint32	constantID;
			0u,										//	deUint32	offset;
			entrySize,								//	deUintptr	size;
		},
		{
			1u,										//	deUint32	constantID;
			static_cast<deUint32>(entrySize),		//	deUint32	offset;
			entrySize,								//	deUintptr	size;
		},
		{
			2u,										//	deUint32	constantID;
			static_cast<deUint32>(entrySize * 2u),	//	deUint32	offset;
			entrySize,								//	deUintptr	size;
		},
	};

	const vk::VkSpecializationInfo specializationInfo =
	{
		3u,													//	deUint32						mapEntryCount;
		specializationMaps,									//	const VkSpecializationMapEntry*	pMapEntries;
		static_cast<deUintptr>(sizeof(specializationData)),	//	deUintptr						dataSize;
		specializationData,									//	const void*						pData;
	};

	// Test pipeline.
	const vk::VkPipelineLayoutCreateInfo testPipelineLayoutInfo =
	{
		vk::VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,	//	VkStructureType					sType;
		nullptr,											//	const void*						pNext;
		0u,													//	VkPipelineLayoutCreateFlags		flags;
		1u,													//	deUint32						setLayoutCount;
		&descriptorSetLayout.get(),							//	const VkDescriptorSetLayout*	pSetLayouts;
		0u,													//	deUint32						pushConstantRangeCount;
		nullptr,											//	const VkPushConstantRange*		pPushConstantRanges;
	};
	const auto testPipelineLayout = vk::createPipelineLayout(vkd, device, &testPipelineLayoutInfo);

	const vk::VkComputePipelineCreateInfo testPipelineInfo =
	{
		vk::VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,	//	VkStructureType					sType;
		nullptr,											//	const void*						pNext;
		0u,													//	VkPipelineCreateFlags			flags;
		{													//	VkPipelineShaderStageCreateInfo	stage;
			vk::VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,//	VkStructureType						sType;
			nullptr,												//	const void*							pNext;
			0u,														//	VkPipelineShaderStageCreateFlags	flags;
			vk::VK_SHADER_STAGE_COMPUTE_BIT,						//	VkShaderStageFlagBits				stage;
			shaderModule.get(),										//	VkShaderModule						module;
			"main",													//	const char*							pName;
			&specializationInfo,									//	const VkSpecializationInfo*			pSpecializationInfo;
		},
		testPipelineLayout.get(),							//	VkPipelineLayout				layout;
		DE_NULL,											//	VkPipeline						basePipelineHandle;
		0u,													//	deInt32							basePipelineIndex;
	};
	const auto testPipeline = vk::createComputePipeline(vkd, device, DE_NULL, &testPipelineInfo);

	// Create descriptor pool and set.
	vk::DescriptorPoolBuilder poolBuilder;
	poolBuilder.addType(vk::VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
	const auto descriptorPool	= poolBuilder.build(vkd, device, vk::VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u);
	const auto descriptorSet	= vk::makeDescriptorSet(vkd, device, descriptorPool.get(), descriptorSetLayout.get());

	// Update descriptor set.
	const vk::VkDescriptorBufferInfo ssboBufferInfo =
	{
		ssbo.get(),		//	VkBuffer		buffer;
		0u,				//	VkDeviceSize	offset;
		VK_WHOLE_SIZE,	//	VkDeviceSize	range;
	};

	vk::DescriptorSetUpdateBuilder updateBuilder;
	updateBuilder.writeSingle(descriptorSet.get(), vk::DescriptorSetUpdateBuilder::Location::binding(0u), vk::VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &ssboBufferInfo);
	updateBuilder.update(vkd, device);

	// Clear buffer so that any unwritten slot reads back as 0.
	auto& ssboAlloc	= ssbo.getAllocation();
	void* ssboPtr	= ssboAlloc.getHostPtr();
	deMemset(ssboPtr, 0, static_cast<size_t>(ssboSize));
	vk::flushAlloc(vkd, device, ssboAlloc);

	// Run pipelines.
	const auto cmdPool		= vk::makeCommandPool(vkd, device, queueIndex);
	const auto cmdBUfferPtr	= vk::allocateCommandBuffer(vkd, device, cmdPool.get(), vk::VK_COMMAND_BUFFER_LEVEL_PRIMARY);
	const auto cmdBuffer	= cmdBUfferPtr.get();

	vk::beginCommandBuffer(vkd, cmdBuffer);

	// Run the main test shader.
	// Make the host-side clear visible before shader writes.
	const auto hostToComputeBarrier = vk::makeBufferMemoryBarrier(vk::VK_ACCESS_HOST_WRITE_BIT, vk::VK_ACCESS_SHADER_WRITE_BIT, ssbo.get(), 0ull, VK_WHOLE_SIZE);
	vkd.cmdPipelineBarrier(cmdBuffer, vk::VK_PIPELINE_STAGE_HOST_BIT, vk::VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0u, 0u, nullptr, 1u, &hostToComputeBarrier, 0u, nullptr);

	// Single workgroup; its local size carries the maximized axis.
	vkd.cmdBindPipeline(cmdBuffer, vk::VK_PIPELINE_BIND_POINT_COMPUTE, testPipeline.get());
	vkd.cmdBindDescriptorSets(cmdBuffer, vk::VK_PIPELINE_BIND_POINT_COMPUTE, testPipelineLayout.get(), 0u, 1u, &descriptorSet.get(), 0u, nullptr);
	vkd.cmdDispatch(cmdBuffer, 1u, 1u, 1u);

	// Make shader writes visible to the host readback below.
	const auto computeToHostBarrier = vk::makeBufferMemoryBarrier(vk::VK_ACCESS_SHADER_WRITE_BIT, vk::VK_ACCESS_HOST_READ_BIT, ssbo.get(), 0ull, VK_WHOLE_SIZE);
	vkd.cmdPipelineBarrier(cmdBuffer, vk::VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, vk::VK_PIPELINE_STAGE_HOST_BIT, 0u, 0u, nullptr, 1u, &computeToHostBarrier, 0u, nullptr);

	vk::endCommandBuffer(vkd, cmdBuffer);
	vk::submitCommandsAndWait(vkd, device, queue, cmdBuffer);

	// Verify buffer contents: every invocation must have written a 1.
	vk::invalidateAlloc(vkd, device, ssboAlloc);
	std::unique_ptr<deUint32[]>	valuesArray	(new deUint32[invocations]);
	deUint32*					valuesPtr	= valuesArray.get();
	deMemcpy(valuesPtr, ssboPtr, static_cast<size_t>(ssboSize));

	std::string	errorMsg;
	bool		ok			= true;

	for (size_t i = 0; i < invocations; ++i)
	{
		if (valuesPtr[i] != 1u)
		{
			ok			= false;
			errorMsg	= "Found invalid value for invocation index " + de::toString(i) + ": expected 1u and found " + de::toString(valuesPtr[i]);
			break;
		}
	}

	if (!ok)
		return tcu::TestStatus::fail(errorMsg);
	return tcu::TestStatus::pass("Pass");
}
3802 
3803 namespace EmptyShaderTest
3804 {
3805 
createProgram(SourceCollections & dst)3806 void createProgram (SourceCollections& dst)
3807 {
3808 	dst.glslSources.add("comp") << glu::ComputeSource(
3809 		"#version 310 es\n"
3810 		"layout (local_size_x = 1) in;\n"
3811 		"void main (void) {}\n"
3812 	);
3813 }
3814 
createTest(Context & context)3815 tcu::TestStatus createTest (Context& context)
3816 {
3817 	const DeviceInterface&	vk					= context.getDeviceInterface();
3818 	const VkDevice			device				= context.getDevice();
3819 	const VkQueue			queue				= context.getUniversalQueue();
3820 	const deUint32			queueFamilyIndex	= context.getUniversalQueueFamilyIndex();
3821 
3822 	const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, context.getBinaryCollection().get("comp"), 0u));
3823 
3824 	const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device));
3825 	const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));
3826 
3827 	const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
3828 	const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
3829 
3830 	// Start recording commands
3831 
3832 	beginCommandBuffer(vk, *cmdBuffer);
3833 
3834 	vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
3835 
3836 	const tcu::IVec3 workGroups(1, 1, 1);
3837 	vk.cmdDispatch(*cmdBuffer, workGroups.x(), workGroups.y(), workGroups.z());
3838 
3839 	endCommandBuffer(vk, *cmdBuffer);
3840 
3841 	submitCommandsAndWait(vk, device, queue, *cmdBuffer);
3842 
3843 	return tcu::TestStatus::pass("Compute succeeded");
3844 }
3845 
3846 } // EmptyShaderTest ns
3847 } // anonymous
3848 
createBasicComputeShaderTests(tcu::TestContext & testCtx)3849 tcu::TestCaseGroup* createBasicComputeShaderTests (tcu::TestContext& testCtx)
3850 {
3851 	de::MovePtr<tcu::TestCaseGroup> basicComputeTests(new tcu::TestCaseGroup(testCtx, "basic", "Basic compute tests"));
3852 
3853 	addFunctionCaseWithPrograms(basicComputeTests.get(), "empty_shader", "Shader that does nothing", EmptyShaderTest::createProgram, EmptyShaderTest::createTest);
3854 
3855 	basicComputeTests->addChild(new ConcurrentCompute(testCtx, "concurrent_compute", "Concurrent compute test"));
3856 
3857 	basicComputeTests->addChild(new EmptyWorkGroupCase(testCtx, "empty_workgroup_x", "Use an empty workgroup with size 0 on the X axis", tcu::UVec3(0u, 2u, 3u)));
3858 	basicComputeTests->addChild(new EmptyWorkGroupCase(testCtx, "empty_workgroup_y", "Use an empty workgroup with size 0 on the Y axis", tcu::UVec3(2u, 0u, 3u)));
3859 	basicComputeTests->addChild(new EmptyWorkGroupCase(testCtx, "empty_workgroup_z", "Use an empty workgroup with size 0 on the Z axis", tcu::UVec3(2u, 3u, 0u)));
3860 	basicComputeTests->addChild(new EmptyWorkGroupCase(testCtx, "empty_workgroup_all", "Use an empty workgroup with size 0 on the X, Y and Z axes", tcu::UVec3(0u, 0u, 0u)));
3861 
3862 	basicComputeTests->addChild(new MaxWorkGroupSizeTest(testCtx, "max_local_size_x", "Use the maximum work group size on the X axis", MaxWorkGroupSizeTest::Params{MaxWorkGroupSizeTest::Axis::X}));
3863 	basicComputeTests->addChild(new MaxWorkGroupSizeTest(testCtx, "max_local_size_y", "Use the maximum work group size on the Y axis", MaxWorkGroupSizeTest::Params{MaxWorkGroupSizeTest::Axis::Y}));
3864 	basicComputeTests->addChild(new MaxWorkGroupSizeTest(testCtx, "max_local_size_z", "Use the maximum work group size on the Z axis", MaxWorkGroupSizeTest::Params{MaxWorkGroupSizeTest::Axis::Z}));
3865 
3866 	basicComputeTests->addChild(BufferToBufferInvertTest::UBOToSSBOInvertCase(testCtx,	"ubo_to_ssbo_single_invocation",	"Copy from UBO to SSBO, inverting bits",	256,	tcu::IVec3(1,1,1),	tcu::IVec3(1,1,1)));
3867 	basicComputeTests->addChild(BufferToBufferInvertTest::UBOToSSBOInvertCase(testCtx,	"ubo_to_ssbo_single_group",			"Copy from UBO to SSBO, inverting bits",	1024,	tcu::IVec3(2,1,4),	tcu::IVec3(1,1,1)));
3868 	basicComputeTests->addChild(BufferToBufferInvertTest::UBOToSSBOInvertCase(testCtx,	"ubo_to_ssbo_multiple_invocations",	"Copy from UBO to SSBO, inverting bits",	1024,	tcu::IVec3(1,1,1),	tcu::IVec3(2,4,1)));
3869 	basicComputeTests->addChild(BufferToBufferInvertTest::UBOToSSBOInvertCase(testCtx,	"ubo_to_ssbo_multiple_groups",		"Copy from UBO to SSBO, inverting bits",	1024,	tcu::IVec3(1,4,2),	tcu::IVec3(2,2,4)));
3870 
3871 	basicComputeTests->addChild(BufferToBufferInvertTest::CopyInvertSSBOCase(testCtx,	"copy_ssbo_single_invocation",		"Copy between SSBOs, inverting bits",	256,	tcu::IVec3(1,1,1),	tcu::IVec3(1,1,1)));
3872 	basicComputeTests->addChild(BufferToBufferInvertTest::CopyInvertSSBOCase(testCtx,	"copy_ssbo_multiple_invocations",	"Copy between SSBOs, inverting bits",	1024,	tcu::IVec3(1,1,1),	tcu::IVec3(2,4,1)));
3873 	basicComputeTests->addChild(BufferToBufferInvertTest::CopyInvertSSBOCase(testCtx,	"copy_ssbo_multiple_groups",		"Copy between SSBOs, inverting bits",	1024,	tcu::IVec3(1,4,2),	tcu::IVec3(2,2,4)));
3874 
3875 	basicComputeTests->addChild(new InvertSSBOInPlaceTest(testCtx,	"ssbo_rw_single_invocation",			"Read and write same SSBO",		256,	true,	tcu::IVec3(1,1,1),	tcu::IVec3(1,1,1)));
3876 	basicComputeTests->addChild(new InvertSSBOInPlaceTest(testCtx,	"ssbo_rw_multiple_groups",				"Read and write same SSBO",		1024,	true,	tcu::IVec3(1,4,2),	tcu::IVec3(2,2,4)));
3877 	basicComputeTests->addChild(new InvertSSBOInPlaceTest(testCtx,	"ssbo_unsized_arr_single_invocation",	"Read and write same SSBO",		256,	false,	tcu::IVec3(1,1,1),	tcu::IVec3(1,1,1)));
3878 	basicComputeTests->addChild(new InvertSSBOInPlaceTest(testCtx,	"ssbo_unsized_arr_multiple_groups",		"Read and write same SSBO",		1024,	false,	tcu::IVec3(1,4,2),	tcu::IVec3(2,2,4)));
3879 
3880 	basicComputeTests->addChild(new WriteToMultipleSSBOTest(testCtx,	"write_multiple_arr_single_invocation",			"Write to multiple SSBOs",	256,	true,	tcu::IVec3(1,1,1),	tcu::IVec3(1,1,1)));
3881 	basicComputeTests->addChild(new WriteToMultipleSSBOTest(testCtx,	"write_multiple_arr_multiple_groups",			"Write to multiple SSBOs",	1024,	true,	tcu::IVec3(1,4,2),	tcu::IVec3(2,2,4)));
3882 	basicComputeTests->addChild(new WriteToMultipleSSBOTest(testCtx,	"write_multiple_unsized_arr_single_invocation",	"Write to multiple SSBOs",	256,	false,	tcu::IVec3(1,1,1),	tcu::IVec3(1,1,1)));
3883 	basicComputeTests->addChild(new WriteToMultipleSSBOTest(testCtx,	"write_multiple_unsized_arr_multiple_groups",	"Write to multiple SSBOs",	1024,	false,	tcu::IVec3(1,4,2),	tcu::IVec3(2,2,4)));
3884 
3885 	basicComputeTests->addChild(new SSBOLocalBarrierTest(testCtx,	"ssbo_local_barrier_single_invocation",	"SSBO local barrier usage",	tcu::IVec3(1,1,1),	tcu::IVec3(1,1,1)));
3886 	basicComputeTests->addChild(new SSBOLocalBarrierTest(testCtx,	"ssbo_local_barrier_single_group",		"SSBO local barrier usage",	tcu::IVec3(3,2,5),	tcu::IVec3(1,1,1)));
3887 	basicComputeTests->addChild(new SSBOLocalBarrierTest(testCtx,	"ssbo_local_barrier_multiple_groups",	"SSBO local barrier usage",	tcu::IVec3(3,4,1),	tcu::IVec3(2,7,3)));
3888 
3889 	basicComputeTests->addChild(new SSBOBarrierTest(testCtx,	"ssbo_cmd_barrier_single",		"SSBO memory barrier usage",	tcu::IVec3(1,1,1)));
3890 	basicComputeTests->addChild(new SSBOBarrierTest(testCtx,	"ssbo_cmd_barrier_multiple",	"SSBO memory barrier usage",	tcu::IVec3(11,5,7)));
3891 
3892 	basicComputeTests->addChild(new SharedVarTest(testCtx,	"shared_var_single_invocation",		"Basic shared variable usage",	tcu::IVec3(1,1,1),	tcu::IVec3(1,1,1)));
3893 	basicComputeTests->addChild(new SharedVarTest(testCtx,	"shared_var_single_group",			"Basic shared variable usage",	tcu::IVec3(3,2,5),	tcu::IVec3(1,1,1)));
3894 	basicComputeTests->addChild(new SharedVarTest(testCtx,	"shared_var_multiple_invocations",	"Basic shared variable usage",	tcu::IVec3(1,1,1),	tcu::IVec3(2,5,4)));
3895 	basicComputeTests->addChild(new SharedVarTest(testCtx,	"shared_var_multiple_groups",		"Basic shared variable usage",	tcu::IVec3(3,4,1),	tcu::IVec3(2,7,3)));
3896 
3897 	basicComputeTests->addChild(new SharedVarAtomicOpTest(testCtx,	"shared_atomic_op_single_invocation",		"Atomic operation with shared var",		tcu::IVec3(1,1,1),	tcu::IVec3(1,1,1)));
3898 	basicComputeTests->addChild(new SharedVarAtomicOpTest(testCtx,	"shared_atomic_op_single_group",			"Atomic operation with shared var",		tcu::IVec3(3,2,5),	tcu::IVec3(1,1,1)));
3899 	basicComputeTests->addChild(new SharedVarAtomicOpTest(testCtx,	"shared_atomic_op_multiple_invocations",	"Atomic operation with shared var",		tcu::IVec3(1,1,1),	tcu::IVec3(2,5,4)));
3900 	basicComputeTests->addChild(new SharedVarAtomicOpTest(testCtx,	"shared_atomic_op_multiple_groups",			"Atomic operation with shared var",		tcu::IVec3(3,4,1),	tcu::IVec3(2,7,3)));
3901 
3902 	basicComputeTests->addChild(new CopyImageToSSBOTest(testCtx,	"copy_image_to_ssbo_small",	"Image to SSBO copy",	tcu::IVec2(1,1),	tcu::IVec2(64,64)));
3903 	basicComputeTests->addChild(new CopyImageToSSBOTest(testCtx,	"copy_image_to_ssbo_large",	"Image to SSBO copy",	tcu::IVec2(2,4),	tcu::IVec2(512,512)));
3904 
3905 	basicComputeTests->addChild(new CopySSBOToImageTest(testCtx,	"copy_ssbo_to_image_small",	"SSBO to image copy",	tcu::IVec2(1, 1),	tcu::IVec2(64, 64)));
3906 	basicComputeTests->addChild(new CopySSBOToImageTest(testCtx,	"copy_ssbo_to_image_large",	"SSBO to image copy",	tcu::IVec2(2, 4),	tcu::IVec2(512, 512)));
3907 
3908 	basicComputeTests->addChild(new ImageAtomicOpTest(testCtx,	"image_atomic_op_local_size_1",	"Atomic operation with image",	1,	tcu::IVec2(64,64)));
3909 	basicComputeTests->addChild(new ImageAtomicOpTest(testCtx,	"image_atomic_op_local_size_8",	"Atomic operation with image",	8,	tcu::IVec2(64,64)));
3910 
3911 	basicComputeTests->addChild(new ImageBarrierTest(testCtx,	"image_barrier_single",		"Image barrier",	tcu::IVec2(1,1)));
3912 	basicComputeTests->addChild(new ImageBarrierTest(testCtx,	"image_barrier_multiple",	"Image barrier",	tcu::IVec2(64,64)));
3913 
3914 #ifndef CTS_USES_VULKANSC
3915 	basicComputeTests->addChild(cts_amber::createAmberTestCase(testCtx, "write_ssbo_array", "", "compute", "write_ssbo_array.amber"));
3916 	basicComputeTests->addChild(cts_amber::createAmberTestCase(testCtx, "branch_past_barrier", "", "compute", "branch_past_barrier.amber"));
3917 #endif
3918 
3919 	return basicComputeTests.release();
3920 }
3921 
createBasicDeviceGroupComputeShaderTests(tcu::TestContext & testCtx)3922 tcu::TestCaseGroup* createBasicDeviceGroupComputeShaderTests (tcu::TestContext& testCtx)
3923 {
3924 	de::MovePtr<tcu::TestCaseGroup> deviceGroupComputeTests(new tcu::TestCaseGroup(testCtx, "device_group", "Basic device group compute tests"));
3925 
3926 	deviceGroupComputeTests->addChild(new DispatchBaseTest(testCtx,	"dispatch_base",	"Compute shader with base groups",				32768,	tcu::IVec3(4,2,4),	tcu::IVec3(16,8,8),	tcu::IVec3(4,8,8)));
3927 	deviceGroupComputeTests->addChild(new DeviceIndexTest(testCtx,	"device_index",		"Compute shader using deviceIndex in SPIRV",	96,		tcu::IVec3(3,2,1),	tcu::IVec3(2,4,1)));
3928 
3929 	return deviceGroupComputeTests.release();
3930 
3931 }
3932 } // compute
3933 } // vkt
3934