• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*------------------------------------------------------------------------
 * Vulkan Conformance Tests
 * ------------------------
 *
 * Copyright (c) 2019 The Khronos Group Inc.
 * Copyright (c) 2019 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *//*!
 * \file
 * \brief Compute Shader Tests
 *//*--------------------------------------------------------------------*/
24 
25 #include "vktComputeBasicComputeShaderTests.hpp"
26 #include "vktTestCase.hpp"
27 #include "vktTestCaseUtil.hpp"
28 #include "vktComputeTestsUtil.hpp"
29 #include "vktCustomInstancesDevices.hpp"
30 #include "vktAmberTestCase.hpp"
31 
32 #include "vkDefs.hpp"
33 #include "vkRef.hpp"
34 #include "vkRefUtil.hpp"
35 #include "vkPlatform.hpp"
36 #include "vkPrograms.hpp"
37 #include "vkRefUtil.hpp"
38 #include "vkMemUtil.hpp"
39 #include "vkBarrierUtil.hpp"
40 #include "vkQueryUtil.hpp"
41 #include "vkBuilderUtil.hpp"
42 #include "vkTypeUtil.hpp"
43 #include "vkDeviceUtil.hpp"
44 #include "vkCmdUtil.hpp"
45 #include "vkObjUtil.hpp"
46 #include "vkBufferWithMemory.hpp"
47 
48 #include "tcuCommandLine.hpp"
49 #include "tcuTestLog.hpp"
50 
51 #include "deStringUtil.hpp"
52 #include "deUniquePtr.hpp"
53 #include "deRandom.hpp"
54 
55 #include <vector>
56 #include <memory>
57 
58 using namespace vk;
59 
60 namespace vkt
61 {
62 namespace compute
63 {
64 namespace
65 {
66 
67 template<typename T, int size>
multiplyComponents(const tcu::Vector<T,size> & v)68 T multiplyComponents (const tcu::Vector<T, size>& v)
69 {
70 	T accum = 1;
71 	for (int i = 0; i < size; ++i)
72 		accum *= v[i];
73 	return accum;
74 }
75 
// Returns a multiplied by itself.
template<typename T>
inline T squared (const T& a)
{
	T result = a * a;
	return result;
}
81 
make2DImageCreateInfo(const tcu::IVec2 & imageSize,const VkImageUsageFlags usage)82 inline VkImageCreateInfo make2DImageCreateInfo (const tcu::IVec2& imageSize, const VkImageUsageFlags usage)
83 {
84 	const VkImageCreateInfo imageParams =
85 	{
86 		VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,				// VkStructureType			sType;
87 		DE_NULL,											// const void*				pNext;
88 		0u,													// VkImageCreateFlags		flags;
89 		VK_IMAGE_TYPE_2D,									// VkImageType				imageType;
90 		VK_FORMAT_R32_UINT,									// VkFormat					format;
91 		vk::makeExtent3D(imageSize.x(), imageSize.y(), 1),	// VkExtent3D				extent;
92 		1u,													// deUint32					mipLevels;
93 		1u,													// deUint32					arrayLayers;
94 		VK_SAMPLE_COUNT_1_BIT,								// VkSampleCountFlagBits	samples;
95 		VK_IMAGE_TILING_OPTIMAL,							// VkImageTiling			tiling;
96 		usage,												// VkImageUsageFlags		usage;
97 		VK_SHARING_MODE_EXCLUSIVE,							// VkSharingMode			sharingMode;
98 		0u,													// deUint32					queueFamilyIndexCount;
99 		DE_NULL,											// const deUint32*			pQueueFamilyIndices;
100 		VK_IMAGE_LAYOUT_UNDEFINED,							// VkImageLayout			initialLayout;
101 	};
102 	return imageParams;
103 }
104 
makeBufferImageCopy(const tcu::IVec2 & imageSize)105 inline VkBufferImageCopy makeBufferImageCopy(const tcu::IVec2& imageSize)
106 {
107 	return compute::makeBufferImageCopy(vk::makeExtent3D(imageSize.x(), imageSize.y(), 1), 1u);
108 }
109 
// Distinguishes the two buffer kinds named by the enumerators.
// NOTE(review): presumably selects between a uniform buffer and a shader
// storage buffer in tests defined further down the file - confirm at the users.
enum BufferType
{
	BUFFER_TYPE_UNIFORM = 0,
	BUFFER_TYPE_SSBO,
};
115 
116 class SharedVarTest : public vkt::TestCase
117 {
118 public:
119 						SharedVarTest	(tcu::TestContext&		testCtx,
120 										 const std::string&		name,
121 										 const std::string&		description,
122 										 const tcu::IVec3&		localSize,
123 										 const tcu::IVec3&		workSize);
124 
125 	void				initPrograms	(SourceCollections&		sourceCollections) const;
126 	TestInstance*		createInstance	(Context&				context) const;
127 
128 private:
129 	const tcu::IVec3	m_localSize;
130 	const tcu::IVec3	m_workSize;
131 };
132 
133 class SharedVarTestInstance : public vkt::TestInstance
134 {
135 public:
136 									SharedVarTestInstance	(Context&			context,
137 															 const tcu::IVec3&	localSize,
138 															 const tcu::IVec3&	workSize);
139 
140 	tcu::TestStatus					iterate					(void);
141 
142 private:
143 	const tcu::IVec3				m_localSize;
144 	const tcu::IVec3				m_workSize;
145 };
146 
SharedVarTest(tcu::TestContext & testCtx,const std::string & name,const std::string & description,const tcu::IVec3 & localSize,const tcu::IVec3 & workSize)147 SharedVarTest::SharedVarTest (tcu::TestContext&		testCtx,
148 							  const std::string&	name,
149 							  const std::string&	description,
150 							  const tcu::IVec3&		localSize,
151 							  const tcu::IVec3&		workSize)
152 	: TestCase		(testCtx, name, description)
153 	, m_localSize	(localSize)
154 	, m_workSize	(workSize)
155 {
156 }
157 
initPrograms(SourceCollections & sourceCollections) const158 void SharedVarTest::initPrograms (SourceCollections& sourceCollections) const
159 {
160 	const int workGroupSize = multiplyComponents(m_localSize);
161 	const int workGroupCount = multiplyComponents(m_workSize);
162 	const int numValues = workGroupSize * workGroupCount;
163 
164 	std::ostringstream src;
165 	src << "#version 310 es\n"
166 		<< "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
167 		<< "layout(binding = 0) writeonly buffer Output {\n"
168 		<< "    uint values[" << numValues << "];\n"
169 		<< "} sb_out;\n\n"
170 		<< "shared uint offsets[" << workGroupSize << "];\n\n"
171 		<< "void main (void) {\n"
172 		<< "    uint localSize  = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_WorkGroupSize.z;\n"
173 		<< "    uint globalNdx  = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
174 		<< "    uint globalOffs = localSize*globalNdx;\n"
175 		<< "    uint localOffs  = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_LocalInvocationID.z + gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x;\n"
176 		<< "\n"
177 		<< "    offsets[localSize-localOffs-1u] = globalOffs + localOffs*localOffs;\n"
178 		<< "    memoryBarrierShared();\n"
179 		<< "    barrier();\n"
180 		<< "    sb_out.values[globalOffs + localOffs] = offsets[localOffs];\n"
181 		<< "}\n";
182 
183 	sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
184 }
185 
createInstance(Context & context) const186 TestInstance* SharedVarTest::createInstance (Context& context) const
187 {
188 	return new SharedVarTestInstance(context, m_localSize, m_workSize);
189 }
190 
SharedVarTestInstance(Context & context,const tcu::IVec3 & localSize,const tcu::IVec3 & workSize)191 SharedVarTestInstance::SharedVarTestInstance (Context& context, const tcu::IVec3& localSize, const tcu::IVec3& workSize)
192 	: TestInstance	(context)
193 	, m_localSize	(localSize)
194 	, m_workSize	(workSize)
195 {
196 }
197 
iterate(void)198 tcu::TestStatus SharedVarTestInstance::iterate (void)
199 {
200 	const DeviceInterface&	vk					= m_context.getDeviceInterface();
201 	const VkDevice			device				= m_context.getDevice();
202 	const VkQueue			queue				= m_context.getUniversalQueue();
203 	const deUint32			queueFamilyIndex	= m_context.getUniversalQueueFamilyIndex();
204 	Allocator&				allocator			= m_context.getDefaultAllocator();
205 
206 	const int workGroupSize = multiplyComponents(m_localSize);
207 	const int workGroupCount = multiplyComponents(m_workSize);
208 
209 	// Create a buffer and host-visible memory for it
210 
211 	const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * workGroupSize * workGroupCount;
212 	const Buffer buffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
213 
214 	// Create descriptor set
215 
216 	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
217 		DescriptorSetLayoutBuilder()
218 		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
219 		.build(vk, device));
220 
221 	const Unique<VkDescriptorPool> descriptorPool(
222 		DescriptorPoolBuilder()
223 		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
224 		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
225 
226 	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
227 
228 	const VkDescriptorBufferInfo descriptorInfo = makeDescriptorBufferInfo(*buffer, 0ull, bufferSizeBytes);
229 	DescriptorSetUpdateBuilder()
230 		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &descriptorInfo)
231 		.update(vk, device);
232 
233 	// Perform the computation
234 
235 	const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
236 	const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
237 	const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));
238 
239 	const VkBufferMemoryBarrier computeFinishBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer, 0ull, bufferSizeBytes);
240 
241 	const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
242 	const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
243 
244 	// Start recording commands
245 
246 	beginCommandBuffer(vk, *cmdBuffer);
247 
248 	vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
249 	vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
250 
251 	vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
252 
253 	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &computeFinishBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
254 
255 	endCommandBuffer(vk, *cmdBuffer);
256 
257 	// Wait for completion
258 
259 	submitCommandsAndWait(vk, device, queue, *cmdBuffer);
260 
261 	// Validate the results
262 
263 	const Allocation& bufferAllocation = buffer.getAllocation();
264 	invalidateAlloc(vk, device, bufferAllocation);
265 
266 	const deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
267 
268 	for (int groupNdx = 0; groupNdx < workGroupCount; ++groupNdx)
269 	{
270 		const int globalOffset = groupNdx * workGroupSize;
271 		for (int localOffset = 0; localOffset < workGroupSize; ++localOffset)
272 		{
273 			const deUint32 res = bufferPtr[globalOffset + localOffset];
274 			const deUint32 ref = globalOffset + squared(workGroupSize - localOffset - 1);
275 
276 			if (res != ref)
277 			{
278 				std::ostringstream msg;
279 				msg << "Comparison failed for Output.values[" << (globalOffset + localOffset) << "]";
280 				return tcu::TestStatus::fail(msg.str());
281 			}
282 		}
283 	}
284 	return tcu::TestStatus::pass("Compute succeeded");
285 }
286 
287 class SharedVarAtomicOpTest : public vkt::TestCase
288 {
289 public:
290 						SharedVarAtomicOpTest	(tcu::TestContext&	testCtx,
291 												 const std::string&	name,
292 												 const std::string&	description,
293 												 const tcu::IVec3&	localSize,
294 												 const tcu::IVec3&	workSize);
295 
296 	void				initPrograms			(SourceCollections& sourceCollections) const;
297 	TestInstance*		createInstance			(Context&			context) const;
298 
299 private:
300 	const tcu::IVec3	m_localSize;
301 	const tcu::IVec3	m_workSize;
302 };
303 
304 class SharedVarAtomicOpTestInstance : public vkt::TestInstance
305 {
306 public:
307 									SharedVarAtomicOpTestInstance	(Context&			context,
308 																	 const tcu::IVec3&	localSize,
309 																	 const tcu::IVec3&	workSize);
310 
311 	tcu::TestStatus					iterate							(void);
312 
313 private:
314 	const tcu::IVec3				m_localSize;
315 	const tcu::IVec3				m_workSize;
316 };
317 
SharedVarAtomicOpTest(tcu::TestContext & testCtx,const std::string & name,const std::string & description,const tcu::IVec3 & localSize,const tcu::IVec3 & workSize)318 SharedVarAtomicOpTest::SharedVarAtomicOpTest (tcu::TestContext&		testCtx,
319 											  const std::string&	name,
320 											  const std::string&	description,
321 											  const tcu::IVec3&		localSize,
322 											  const tcu::IVec3&		workSize)
323 	: TestCase		(testCtx, name, description)
324 	, m_localSize	(localSize)
325 	, m_workSize	(workSize)
326 {
327 }
328 
initPrograms(SourceCollections & sourceCollections) const329 void SharedVarAtomicOpTest::initPrograms (SourceCollections& sourceCollections) const
330 {
331 	const int workGroupSize = multiplyComponents(m_localSize);
332 	const int workGroupCount = multiplyComponents(m_workSize);
333 	const int numValues = workGroupSize * workGroupCount;
334 
335 	std::ostringstream src;
336 	src << "#version 310 es\n"
337 		<< "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
338 		<< "layout(binding = 0) writeonly buffer Output {\n"
339 		<< "    uint values[" << numValues << "];\n"
340 		<< "} sb_out;\n\n"
341 		<< "shared uint count;\n\n"
342 		<< "void main (void) {\n"
343 		<< "    uint localSize  = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_WorkGroupSize.z;\n"
344 		<< "    uint globalNdx  = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
345 		<< "    uint globalOffs = localSize*globalNdx;\n"
346 		<< "\n"
347 		<< "    count = 0u;\n"
348 		<< "    memoryBarrierShared();\n"
349 		<< "    barrier();\n"
350 		<< "    uint oldVal = atomicAdd(count, 1u);\n"
351 		<< "    sb_out.values[globalOffs+oldVal] = oldVal+1u;\n"
352 		<< "}\n";
353 
354 	sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
355 }
356 
createInstance(Context & context) const357 TestInstance* SharedVarAtomicOpTest::createInstance (Context& context) const
358 {
359 	return new SharedVarAtomicOpTestInstance(context, m_localSize, m_workSize);
360 }
361 
SharedVarAtomicOpTestInstance(Context & context,const tcu::IVec3 & localSize,const tcu::IVec3 & workSize)362 SharedVarAtomicOpTestInstance::SharedVarAtomicOpTestInstance (Context& context, const tcu::IVec3& localSize, const tcu::IVec3& workSize)
363 	: TestInstance	(context)
364 	, m_localSize	(localSize)
365 	, m_workSize	(workSize)
366 {
367 }
368 
iterate(void)369 tcu::TestStatus SharedVarAtomicOpTestInstance::iterate (void)
370 {
371 	const DeviceInterface&	vk					= m_context.getDeviceInterface();
372 	const VkDevice			device				= m_context.getDevice();
373 	const VkQueue			queue				= m_context.getUniversalQueue();
374 	const deUint32			queueFamilyIndex	= m_context.getUniversalQueueFamilyIndex();
375 	Allocator&				allocator			= m_context.getDefaultAllocator();
376 
377 	const int workGroupSize = multiplyComponents(m_localSize);
378 	const int workGroupCount = multiplyComponents(m_workSize);
379 
380 	// Create a buffer and host-visible memory for it
381 
382 	const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * workGroupSize * workGroupCount;
383 	const Buffer buffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
384 
385 	// Create descriptor set
386 
387 	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
388 		DescriptorSetLayoutBuilder()
389 		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
390 		.build(vk, device));
391 
392 	const Unique<VkDescriptorPool> descriptorPool(
393 		DescriptorPoolBuilder()
394 		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
395 		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
396 
397 	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
398 
399 	const VkDescriptorBufferInfo descriptorInfo = makeDescriptorBufferInfo(*buffer, 0ull, bufferSizeBytes);
400 	DescriptorSetUpdateBuilder()
401 		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &descriptorInfo)
402 		.update(vk, device);
403 
404 	// Perform the computation
405 
406 	const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
407 	const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
408 	const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));
409 
410 	const VkBufferMemoryBarrier computeFinishBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer, 0ull, bufferSizeBytes);
411 
412 	const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
413 	const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
414 
415 	// Start recording commands
416 
417 	beginCommandBuffer(vk, *cmdBuffer);
418 
419 	vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
420 	vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
421 
422 	vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
423 
424 	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1u, &computeFinishBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
425 
426 	endCommandBuffer(vk, *cmdBuffer);
427 
428 	// Wait for completion
429 
430 	submitCommandsAndWait(vk, device, queue, *cmdBuffer);
431 
432 	// Validate the results
433 
434 	const Allocation& bufferAllocation = buffer.getAllocation();
435 	invalidateAlloc(vk, device, bufferAllocation);
436 
437 	const deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
438 
439 	for (int groupNdx = 0; groupNdx < workGroupCount; ++groupNdx)
440 	{
441 		const int globalOffset = groupNdx * workGroupSize;
442 		for (int localOffset = 0; localOffset < workGroupSize; ++localOffset)
443 		{
444 			const deUint32 res = bufferPtr[globalOffset + localOffset];
445 			const deUint32 ref = localOffset + 1;
446 
447 			if (res != ref)
448 			{
449 				std::ostringstream msg;
450 				msg << "Comparison failed for Output.values[" << (globalOffset + localOffset) << "]";
451 				return tcu::TestStatus::fail(msg.str());
452 			}
453 		}
454 	}
455 	return tcu::TestStatus::pass("Compute succeeded");
456 }
457 
458 class SSBOLocalBarrierTest : public vkt::TestCase
459 {
460 public:
461 						SSBOLocalBarrierTest	(tcu::TestContext&	testCtx,
462 												 const std::string& name,
463 												 const std::string&	description,
464 												 const tcu::IVec3&	localSize,
465 												 const tcu::IVec3&	workSize);
466 
467 	void				initPrograms			(SourceCollections& sourceCollections) const;
468 	TestInstance*		createInstance			(Context&			context) const;
469 
470 private:
471 	const tcu::IVec3	m_localSize;
472 	const tcu::IVec3	m_workSize;
473 };
474 
475 class SSBOLocalBarrierTestInstance : public vkt::TestInstance
476 {
477 public:
478 									SSBOLocalBarrierTestInstance	(Context&			context,
479 																	 const tcu::IVec3&	localSize,
480 																	 const tcu::IVec3&	workSize);
481 
482 	tcu::TestStatus					iterate							(void);
483 
484 private:
485 	const tcu::IVec3				m_localSize;
486 	const tcu::IVec3				m_workSize;
487 };
488 
SSBOLocalBarrierTest(tcu::TestContext & testCtx,const std::string & name,const std::string & description,const tcu::IVec3 & localSize,const tcu::IVec3 & workSize)489 SSBOLocalBarrierTest::SSBOLocalBarrierTest (tcu::TestContext&	testCtx,
490 											const std::string&	name,
491 											const std::string&	description,
492 											const tcu::IVec3&	localSize,
493 											const tcu::IVec3&	workSize)
494 	: TestCase		(testCtx, name, description)
495 	, m_localSize	(localSize)
496 	, m_workSize	(workSize)
497 {
498 }
499 
initPrograms(SourceCollections & sourceCollections) const500 void SSBOLocalBarrierTest::initPrograms (SourceCollections& sourceCollections) const
501 {
502 	const int workGroupSize = multiplyComponents(m_localSize);
503 	const int workGroupCount = multiplyComponents(m_workSize);
504 	const int numValues = workGroupSize * workGroupCount;
505 
506 	std::ostringstream src;
507 	src << "#version 310 es\n"
508 		<< "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
509 		<< "layout(binding = 0) coherent buffer Output {\n"
510 		<< "    uint values[" << numValues << "];\n"
511 		<< "} sb_out;\n\n"
512 		<< "void main (void) {\n"
513 		<< "    uint localSize  = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_WorkGroupSize.z;\n"
514 		<< "    uint globalNdx  = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
515 		<< "    uint globalOffs = localSize*globalNdx;\n"
516 		<< "    uint localOffs  = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_LocalInvocationID.z + gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x;\n"
517 		<< "\n"
518 		<< "    sb_out.values[globalOffs + localOffs] = globalOffs;\n"
519 		<< "    memoryBarrierBuffer();\n"
520 		<< "    barrier();\n"
521 		<< "    sb_out.values[globalOffs + ((localOffs+1u)%localSize)] += localOffs;\n"		// += so we read and write
522 		<< "    memoryBarrierBuffer();\n"
523 		<< "    barrier();\n"
524 		<< "    sb_out.values[globalOffs + ((localOffs+2u)%localSize)] += localOffs;\n"
525 		<< "}\n";
526 
527 	sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
528 }
529 
createInstance(Context & context) const530 TestInstance* SSBOLocalBarrierTest::createInstance (Context& context) const
531 {
532 	return new SSBOLocalBarrierTestInstance(context, m_localSize, m_workSize);
533 }
534 
SSBOLocalBarrierTestInstance(Context & context,const tcu::IVec3 & localSize,const tcu::IVec3 & workSize)535 SSBOLocalBarrierTestInstance::SSBOLocalBarrierTestInstance (Context& context, const tcu::IVec3& localSize, const tcu::IVec3& workSize)
536 	: TestInstance	(context)
537 	, m_localSize	(localSize)
538 	, m_workSize	(workSize)
539 {
540 }
541 
iterate(void)542 tcu::TestStatus SSBOLocalBarrierTestInstance::iterate (void)
543 {
544 	const DeviceInterface&	vk					= m_context.getDeviceInterface();
545 	const VkDevice			device				= m_context.getDevice();
546 	const VkQueue			queue				= m_context.getUniversalQueue();
547 	const deUint32			queueFamilyIndex	= m_context.getUniversalQueueFamilyIndex();
548 	Allocator&				allocator			= m_context.getDefaultAllocator();
549 
550 	const int workGroupSize = multiplyComponents(m_localSize);
551 	const int workGroupCount = multiplyComponents(m_workSize);
552 
553 	// Create a buffer and host-visible memory for it
554 
555 	const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * workGroupSize * workGroupCount;
556 	const Buffer buffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
557 
558 	// Create descriptor set
559 
560 	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
561 		DescriptorSetLayoutBuilder()
562 		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
563 		.build(vk, device));
564 
565 	const Unique<VkDescriptorPool> descriptorPool(
566 		DescriptorPoolBuilder()
567 		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
568 		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
569 
570 	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
571 
572 	const VkDescriptorBufferInfo descriptorInfo = makeDescriptorBufferInfo(*buffer, 0ull, bufferSizeBytes);
573 	DescriptorSetUpdateBuilder()
574 		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &descriptorInfo)
575 		.update(vk, device);
576 
577 	// Perform the computation
578 
579 	const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
580 	const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
581 	const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));
582 
583 	const VkBufferMemoryBarrier computeFinishBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer, 0ull, bufferSizeBytes);
584 
585 	const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
586 	const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
587 
588 	// Start recording commands
589 
590 	beginCommandBuffer(vk, *cmdBuffer);
591 
592 	vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
593 	vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
594 
595 	vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
596 
597 	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &computeFinishBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
598 
599 	endCommandBuffer(vk, *cmdBuffer);
600 
601 	// Wait for completion
602 
603 	submitCommandsAndWait(vk, device, queue, *cmdBuffer);
604 
605 	// Validate the results
606 
607 	const Allocation& bufferAllocation = buffer.getAllocation();
608 	invalidateAlloc(vk, device, bufferAllocation);
609 
610 	const deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
611 
612 	for (int groupNdx = 0; groupNdx < workGroupCount; ++groupNdx)
613 	{
614 		const int globalOffset = groupNdx * workGroupSize;
615 		for (int localOffset = 0; localOffset < workGroupSize; ++localOffset)
616 		{
617 			const deUint32	res		= bufferPtr[globalOffset + localOffset];
618 			const int		offs0	= localOffset - 1 < 0 ? ((localOffset + workGroupSize - 1) % workGroupSize) : ((localOffset - 1) % workGroupSize);
619 			const int		offs1	= localOffset - 2 < 0 ? ((localOffset + workGroupSize - 2) % workGroupSize) : ((localOffset - 2) % workGroupSize);
620 			const deUint32	ref		= static_cast<deUint32>(globalOffset + offs0 + offs1);
621 
622 			if (res != ref)
623 			{
624 				std::ostringstream msg;
625 				msg << "Comparison failed for Output.values[" << (globalOffset + localOffset) << "]";
626 				return tcu::TestStatus::fail(msg.str());
627 			}
628 		}
629 	}
630 	return tcu::TestStatus::pass("Compute succeeded");
631 }
632 
633 class CopyImageToSSBOTest : public vkt::TestCase
634 {
635 public:
636 						CopyImageToSSBOTest		(tcu::TestContext&	testCtx,
637 												 const std::string&	name,
638 												 const std::string&	description,
639 												 const tcu::IVec2&	localSize,
640 												 const tcu::IVec2&	imageSize);
641 
642 	void				initPrograms			(SourceCollections& sourceCollections) const;
643 	TestInstance*		createInstance			(Context&			context) const;
644 
645 private:
646 	const tcu::IVec2	m_localSize;
647 	const tcu::IVec2	m_imageSize;
648 };
649 
650 class CopyImageToSSBOTestInstance : public vkt::TestInstance
651 {
652 public:
653 									CopyImageToSSBOTestInstance		(Context&			context,
654 																	 const tcu::IVec2&	localSize,
655 																	 const tcu::IVec2&	imageSize);
656 
657 	tcu::TestStatus					iterate							(void);
658 
659 private:
660 	const tcu::IVec2				m_localSize;
661 	const tcu::IVec2				m_imageSize;
662 };
663 
CopyImageToSSBOTest(tcu::TestContext & testCtx,const std::string & name,const std::string & description,const tcu::IVec2 & localSize,const tcu::IVec2 & imageSize)664 CopyImageToSSBOTest::CopyImageToSSBOTest (tcu::TestContext&		testCtx,
665 										  const std::string&	name,
666 										  const std::string&	description,
667 										  const tcu::IVec2&		localSize,
668 										  const tcu::IVec2&		imageSize)
669 	: TestCase		(testCtx, name, description)
670 	, m_localSize	(localSize)
671 	, m_imageSize	(imageSize)
672 {
673 	DE_ASSERT(m_imageSize.x() % m_localSize.x() == 0);
674 	DE_ASSERT(m_imageSize.y() % m_localSize.y() == 0);
675 }
676 
initPrograms(SourceCollections & sourceCollections) const677 void CopyImageToSSBOTest::initPrograms (SourceCollections& sourceCollections) const
678 {
679 	std::ostringstream src;
680 	src << "#version 310 es\n"
681 		<< "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ") in;\n"
682 		<< "layout(binding = 1, r32ui) readonly uniform highp uimage2D u_srcImg;\n"
683 		<< "layout(binding = 0) writeonly buffer Output {\n"
684 		<< "    uint values[" << (m_imageSize.x() * m_imageSize.y()) << "];\n"
685 		<< "} sb_out;\n\n"
686 		<< "void main (void) {\n"
687 		<< "    uint stride = gl_NumWorkGroups.x*gl_WorkGroupSize.x;\n"
688 		<< "    uint value  = imageLoad(u_srcImg, ivec2(gl_GlobalInvocationID.xy)).x;\n"
689 		<< "    sb_out.values[gl_GlobalInvocationID.y*stride + gl_GlobalInvocationID.x] = value;\n"
690 		<< "}\n";
691 
692 	sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
693 }
694 
createInstance(Context & context) const695 TestInstance* CopyImageToSSBOTest::createInstance (Context& context) const
696 {
697 	return new CopyImageToSSBOTestInstance(context, m_localSize, m_imageSize);
698 }
699 
CopyImageToSSBOTestInstance(Context & context,const tcu::IVec2 & localSize,const tcu::IVec2 & imageSize)700 CopyImageToSSBOTestInstance::CopyImageToSSBOTestInstance (Context& context, const tcu::IVec2& localSize, const tcu::IVec2& imageSize)
701 	: TestInstance	(context)
702 	, m_localSize	(localSize)
703 	, m_imageSize	(imageSize)
704 {
705 }
706 
iterate(void)707 tcu::TestStatus CopyImageToSSBOTestInstance::iterate (void)
708 {
709 	const DeviceInterface&	vk					= m_context.getDeviceInterface();
710 	const VkDevice			device				= m_context.getDevice();
711 	const VkQueue			queue				= m_context.getUniversalQueue();
712 	const deUint32			queueFamilyIndex	= m_context.getUniversalQueueFamilyIndex();
713 	Allocator&				allocator			= m_context.getDefaultAllocator();
714 
715 	// Create an image
716 
717 	const VkImageCreateInfo imageParams = make2DImageCreateInfo(m_imageSize, VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_STORAGE_BIT);
718 	const Image image(vk, device, allocator, imageParams, MemoryRequirement::Any);
719 
720 	const VkImageSubresourceRange subresourceRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
721 	const Unique<VkImageView> imageView(makeImageView(vk, device, *image, VK_IMAGE_VIEW_TYPE_2D, VK_FORMAT_R32_UINT, subresourceRange));
722 
723 	// Staging buffer (source data for image)
724 
725 	const deUint32 imageArea = multiplyComponents(m_imageSize);
726 	const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * imageArea;
727 
728 	const Buffer stagingBuffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_TRANSFER_SRC_BIT), MemoryRequirement::HostVisible);
729 
730 	// Populate the staging buffer with test data
731 	{
732 		de::Random rnd(0xab2c7);
733 		const Allocation& stagingBufferAllocation = stagingBuffer.getAllocation();
734 		deUint32* bufferPtr = static_cast<deUint32*>(stagingBufferAllocation.getHostPtr());
735 		for (deUint32 i = 0; i < imageArea; ++i)
736 			*bufferPtr++ = rnd.getUint32();
737 
738 		flushAlloc(vk, device, stagingBufferAllocation);
739 	}
740 
741 	// Create a buffer to store shader output
742 
743 	const Buffer outputBuffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
744 
745 	// Create descriptor set
746 
747 	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
748 		DescriptorSetLayoutBuilder()
749 		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
750 		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_SHADER_STAGE_COMPUTE_BIT)
751 		.build(vk, device));
752 
753 	const Unique<VkDescriptorPool> descriptorPool(
754 		DescriptorPoolBuilder()
755 		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
756 		.addType(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE)
757 		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
758 
759 	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
760 
761 	// Set the bindings
762 
763 	const VkDescriptorImageInfo imageDescriptorInfo = makeDescriptorImageInfo(DE_NULL, *imageView, VK_IMAGE_LAYOUT_GENERAL);
764 	const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*outputBuffer, 0ull, bufferSizeBytes);
765 
766 	DescriptorSetUpdateBuilder()
767 		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
768 		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &imageDescriptorInfo)
769 		.update(vk, device);
770 
771 	// Perform the computation
772 	{
773 		const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
774 		const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
775 		const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));
776 
777 		const VkBufferMemoryBarrier computeFinishBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *outputBuffer, 0ull, bufferSizeBytes);
778 		const tcu::IVec2 workSize = m_imageSize / m_localSize;
779 
780 		// Prepare the command buffer
781 
782 		const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
783 		const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
784 
785 		// Start recording commands
786 
787 		beginCommandBuffer(vk, *cmdBuffer);
788 
789 		vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
790 		vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
791 
792 		const std::vector<VkBufferImageCopy> bufferImageCopy(1, makeBufferImageCopy(m_imageSize));
793 		copyBufferToImage(vk, *cmdBuffer, *stagingBuffer, bufferSizeBytes, bufferImageCopy, VK_IMAGE_ASPECT_COLOR_BIT, 1, 1, *image, VK_IMAGE_LAYOUT_GENERAL, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT);
794 
795 		vk.cmdDispatch(*cmdBuffer, workSize.x(), workSize.y(), 1u);
796 		vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &computeFinishBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
797 
798 		endCommandBuffer(vk, *cmdBuffer);
799 
800 		// Wait for completion
801 
802 		submitCommandsAndWait(vk, device, queue, *cmdBuffer);
803 	}
804 
805 	// Validate the results
806 
807 	const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
808 	invalidateAlloc(vk, device, outputBufferAllocation);
809 
810 	const deUint32* bufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
811 	const deUint32* refBufferPtr = static_cast<deUint32*>(stagingBuffer.getAllocation().getHostPtr());
812 
813 	for (deUint32 ndx = 0; ndx < imageArea; ++ndx)
814 	{
815 		const deUint32 res = *(bufferPtr + ndx);
816 		const deUint32 ref = *(refBufferPtr + ndx);
817 
818 		if (res != ref)
819 		{
820 			std::ostringstream msg;
821 			msg << "Comparison failed for Output.values[" << ndx << "]";
822 			return tcu::TestStatus::fail(msg.str());
823 		}
824 	}
825 	return tcu::TestStatus::pass("Compute succeeded");
826 }
827 
828 class CopySSBOToImageTest : public vkt::TestCase
829 {
830 public:
831 						CopySSBOToImageTest	(tcu::TestContext&	testCtx,
832 											 const std::string&	name,
833 											 const std::string&	description,
834 											 const tcu::IVec2&	localSize,
835 											 const tcu::IVec2&	imageSize);
836 
837 	void				initPrograms		(SourceCollections& sourceCollections) const;
838 	TestInstance*		createInstance		(Context&			context) const;
839 
840 private:
841 	const tcu::IVec2	m_localSize;
842 	const tcu::IVec2	m_imageSize;
843 };
844 
845 class CopySSBOToImageTestInstance : public vkt::TestInstance
846 {
847 public:
848 									CopySSBOToImageTestInstance	(Context&			context,
849 																 const tcu::IVec2&	localSize,
850 																 const tcu::IVec2&	imageSize);
851 
852 	tcu::TestStatus					iterate						(void);
853 
854 private:
855 	const tcu::IVec2				m_localSize;
856 	const tcu::IVec2				m_imageSize;
857 };
858 
CopySSBOToImageTest(tcu::TestContext & testCtx,const std::string & name,const std::string & description,const tcu::IVec2 & localSize,const tcu::IVec2 & imageSize)859 CopySSBOToImageTest::CopySSBOToImageTest (tcu::TestContext&		testCtx,
860 										  const std::string&	name,
861 										  const std::string&	description,
862 										  const tcu::IVec2&		localSize,
863 										  const tcu::IVec2&		imageSize)
864 	: TestCase		(testCtx, name, description)
865 	, m_localSize	(localSize)
866 	, m_imageSize	(imageSize)
867 {
868 	DE_ASSERT(m_imageSize.x() % m_localSize.x() == 0);
869 	DE_ASSERT(m_imageSize.y() % m_localSize.y() == 0);
870 }
871 
initPrograms(SourceCollections & sourceCollections) const872 void CopySSBOToImageTest::initPrograms (SourceCollections& sourceCollections) const
873 {
874 	std::ostringstream src;
875 	src << "#version 310 es\n"
876 		<< "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ") in;\n"
877 		<< "layout(binding = 1, r32ui) writeonly uniform highp uimage2D u_dstImg;\n"
878 		<< "layout(binding = 0) readonly buffer Input {\n"
879 		<< "    uint values[" << (m_imageSize.x() * m_imageSize.y()) << "];\n"
880 		<< "} sb_in;\n\n"
881 		<< "void main (void) {\n"
882 		<< "    uint stride = gl_NumWorkGroups.x*gl_WorkGroupSize.x;\n"
883 		<< "    uint value  = sb_in.values[gl_GlobalInvocationID.y*stride + gl_GlobalInvocationID.x];\n"
884 		<< "    imageStore(u_dstImg, ivec2(gl_GlobalInvocationID.xy), uvec4(value, 0, 0, 0));\n"
885 		<< "}\n";
886 
887 	sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
888 }
889 
createInstance(Context & context) const890 TestInstance* CopySSBOToImageTest::createInstance (Context& context) const
891 {
892 	return new CopySSBOToImageTestInstance(context, m_localSize, m_imageSize);
893 }
894 
CopySSBOToImageTestInstance(Context & context,const tcu::IVec2 & localSize,const tcu::IVec2 & imageSize)895 CopySSBOToImageTestInstance::CopySSBOToImageTestInstance (Context& context, const tcu::IVec2& localSize, const tcu::IVec2& imageSize)
896 	: TestInstance	(context)
897 	, m_localSize	(localSize)
898 	, m_imageSize	(imageSize)
899 {
900 }
901 
iterate(void)902 tcu::TestStatus CopySSBOToImageTestInstance::iterate (void)
903 {
904 	const DeviceInterface&	vk					= m_context.getDeviceInterface();
905 	const VkDevice			device				= m_context.getDevice();
906 	const VkQueue			queue				= m_context.getUniversalQueue();
907 	const deUint32			queueFamilyIndex	= m_context.getUniversalQueueFamilyIndex();
908 	Allocator&				allocator			= m_context.getDefaultAllocator();
909 
910 	// Create an image
911 
912 	const VkImageCreateInfo imageParams = make2DImageCreateInfo(m_imageSize, VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_STORAGE_BIT);
913 	const Image image(vk, device, allocator, imageParams, MemoryRequirement::Any);
914 
915 	const VkImageSubresourceRange subresourceRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
916 	const Unique<VkImageView> imageView(makeImageView(vk, device, *image, VK_IMAGE_VIEW_TYPE_2D, VK_FORMAT_R32_UINT, subresourceRange));
917 
918 	// Create an input buffer (data to be read in the shader)
919 
920 	const deUint32 imageArea = multiplyComponents(m_imageSize);
921 	const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * imageArea;
922 
923 	const Buffer inputBuffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
924 
925 	// Populate the buffer with test data
926 	{
927 		de::Random rnd(0x77238ac2);
928 		const Allocation& inputBufferAllocation = inputBuffer.getAllocation();
929 		deUint32* bufferPtr = static_cast<deUint32*>(inputBufferAllocation.getHostPtr());
930 		for (deUint32 i = 0; i < imageArea; ++i)
931 			*bufferPtr++ = rnd.getUint32();
932 
933 		flushAlloc(vk, device, inputBufferAllocation);
934 	}
935 
936 	// Create a buffer to store shader output (copied from image data)
937 
938 	const Buffer outputBuffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_TRANSFER_DST_BIT), MemoryRequirement::HostVisible);
939 
940 	// Create descriptor set
941 
942 	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
943 		DescriptorSetLayoutBuilder()
944 		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
945 		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_SHADER_STAGE_COMPUTE_BIT)
946 		.build(vk, device));
947 
948 	const Unique<VkDescriptorPool> descriptorPool(
949 		DescriptorPoolBuilder()
950 		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
951 		.addType(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE)
952 		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
953 
954 	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
955 
956 	// Set the bindings
957 
958 	const VkDescriptorImageInfo imageDescriptorInfo = makeDescriptorImageInfo(DE_NULL, *imageView, VK_IMAGE_LAYOUT_GENERAL);
959 	const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*inputBuffer, 0ull, bufferSizeBytes);
960 
961 	DescriptorSetUpdateBuilder()
962 		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
963 		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &imageDescriptorInfo)
964 		.update(vk, device);
965 
966 	// Perform the computation
967 	{
968 		const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
969 		const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
970 		const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));
971 
972 		const VkBufferMemoryBarrier inputBufferPostHostWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *inputBuffer, 0ull, bufferSizeBytes);
973 
974 		const VkImageMemoryBarrier imageLayoutBarrier = makeImageMemoryBarrier(
975 			0u, VK_ACCESS_SHADER_WRITE_BIT,
976 			VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL,
977 			*image, subresourceRange);
978 
979 		const tcu::IVec2 workSize = m_imageSize / m_localSize;
980 
981 		// Prepare the command buffer
982 
983 		const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
984 		const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
985 
986 		// Start recording commands
987 
988 		beginCommandBuffer(vk, *cmdBuffer);
989 
990 		vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
991 		vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
992 
993 		vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &inputBufferPostHostWriteBarrier, 1, &imageLayoutBarrier);
994 		vk.cmdDispatch(*cmdBuffer, workSize.x(), workSize.y(), 1u);
995 
996 		copyImageToBuffer(vk, *cmdBuffer, *image, *outputBuffer, m_imageSize, VK_ACCESS_SHADER_WRITE_BIT, VK_IMAGE_LAYOUT_GENERAL);
997 
998 		endCommandBuffer(vk, *cmdBuffer);
999 
1000 		// Wait for completion
1001 
1002 		submitCommandsAndWait(vk, device, queue, *cmdBuffer);
1003 	}
1004 
1005 	// Validate the results
1006 
1007 	const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
1008 	invalidateAlloc(vk, device, outputBufferAllocation);
1009 
1010 	const deUint32* bufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
1011 	const deUint32* refBufferPtr = static_cast<deUint32*>(inputBuffer.getAllocation().getHostPtr());
1012 
1013 	for (deUint32 ndx = 0; ndx < imageArea; ++ndx)
1014 	{
1015 		const deUint32 res = *(bufferPtr + ndx);
1016 		const deUint32 ref = *(refBufferPtr + ndx);
1017 
1018 		if (res != ref)
1019 		{
1020 			std::ostringstream msg;
1021 			msg << "Comparison failed for pixel " << ndx;
1022 			return tcu::TestStatus::fail(msg.str());
1023 		}
1024 	}
1025 	return tcu::TestStatus::pass("Compute succeeded");
1026 }
1027 
1028 class BufferToBufferInvertTest : public vkt::TestCase
1029 {
1030 public:
1031 	void								initPrograms				(SourceCollections&	sourceCollections) const;
1032 	TestInstance*						createInstance				(Context&			context) const;
1033 
1034 	static BufferToBufferInvertTest*	UBOToSSBOInvertCase			(tcu::TestContext&	testCtx,
1035 																	 const std::string& name,
1036 																	 const std::string& description,
1037 																	 const deUint32		numValues,
1038 																	 const tcu::IVec3&	localSize,
1039 																	 const tcu::IVec3&	workSize);
1040 
1041 	static BufferToBufferInvertTest*	CopyInvertSSBOCase			(tcu::TestContext&	testCtx,
1042 																	 const std::string& name,
1043 																	 const std::string& description,
1044 																	 const deUint32		numValues,
1045 																	 const tcu::IVec3&	localSize,
1046 																	 const tcu::IVec3&	workSize);
1047 
1048 private:
1049 										BufferToBufferInvertTest	(tcu::TestContext&	testCtx,
1050 																	 const std::string& name,
1051 																	 const std::string& description,
1052 																	 const deUint32		numValues,
1053 																	 const tcu::IVec3&	localSize,
1054 																	 const tcu::IVec3&	workSize,
1055 																	 const BufferType	bufferType);
1056 
1057 	const BufferType					m_bufferType;
1058 	const deUint32						m_numValues;
1059 	const tcu::IVec3					m_localSize;
1060 	const tcu::IVec3					m_workSize;
1061 };
1062 
1063 class BufferToBufferInvertTestInstance : public vkt::TestInstance
1064 {
1065 public:
1066 									BufferToBufferInvertTestInstance	(Context&			context,
1067 																		 const deUint32		numValues,
1068 																		 const tcu::IVec3&	localSize,
1069 																		 const tcu::IVec3&	workSize,
1070 																		 const BufferType	bufferType);
1071 
1072 	tcu::TestStatus					iterate								(void);
1073 
1074 private:
1075 	const BufferType				m_bufferType;
1076 	const deUint32					m_numValues;
1077 	const tcu::IVec3				m_localSize;
1078 	const tcu::IVec3				m_workSize;
1079 };
1080 
BufferToBufferInvertTest(tcu::TestContext & testCtx,const std::string & name,const std::string & description,const deUint32 numValues,const tcu::IVec3 & localSize,const tcu::IVec3 & workSize,const BufferType bufferType)1081 BufferToBufferInvertTest::BufferToBufferInvertTest (tcu::TestContext&	testCtx,
1082 													const std::string&	name,
1083 													const std::string&	description,
1084 													const deUint32		numValues,
1085 													const tcu::IVec3&	localSize,
1086 													const tcu::IVec3&	workSize,
1087 													const BufferType	bufferType)
1088 	: TestCase		(testCtx, name, description)
1089 	, m_bufferType	(bufferType)
1090 	, m_numValues	(numValues)
1091 	, m_localSize	(localSize)
1092 	, m_workSize	(workSize)
1093 {
1094 	DE_ASSERT(m_numValues % (multiplyComponents(m_workSize) * multiplyComponents(m_localSize)) == 0);
1095 	DE_ASSERT(m_bufferType == BUFFER_TYPE_UNIFORM || m_bufferType == BUFFER_TYPE_SSBO);
1096 }
1097 
UBOToSSBOInvertCase(tcu::TestContext & testCtx,const std::string & name,const std::string & description,const deUint32 numValues,const tcu::IVec3 & localSize,const tcu::IVec3 & workSize)1098 BufferToBufferInvertTest* BufferToBufferInvertTest::UBOToSSBOInvertCase (tcu::TestContext&	testCtx,
1099 																		 const std::string&	name,
1100 																		 const std::string&	description,
1101 																		 const deUint32		numValues,
1102 																		 const tcu::IVec3&	localSize,
1103 																		 const tcu::IVec3&	workSize)
1104 {
1105 	return new BufferToBufferInvertTest(testCtx, name, description, numValues, localSize, workSize, BUFFER_TYPE_UNIFORM);
1106 }
1107 
CopyInvertSSBOCase(tcu::TestContext & testCtx,const std::string & name,const std::string & description,const deUint32 numValues,const tcu::IVec3 & localSize,const tcu::IVec3 & workSize)1108 BufferToBufferInvertTest* BufferToBufferInvertTest::CopyInvertSSBOCase (tcu::TestContext&	testCtx,
1109 																		const std::string&	name,
1110 																		const std::string&	description,
1111 																		const deUint32		numValues,
1112 																		const tcu::IVec3&	localSize,
1113 																		const tcu::IVec3&	workSize)
1114 {
1115 	return new BufferToBufferInvertTest(testCtx, name, description, numValues, localSize, workSize, BUFFER_TYPE_SSBO);
1116 }
1117 
initPrograms(SourceCollections & sourceCollections) const1118 void BufferToBufferInvertTest::initPrograms (SourceCollections& sourceCollections) const
1119 {
1120 	std::ostringstream src;
1121 	if (m_bufferType == BUFFER_TYPE_UNIFORM)
1122 	{
1123 		src << "#version 310 es\n"
1124 			<< "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
1125 			<< "layout(binding = 0) readonly uniform Input {\n"
1126 			<< "    uint values[" << m_numValues << "];\n"
1127 			<< "} ub_in;\n"
1128 			<< "layout(binding = 1, std140) writeonly buffer Output {\n"
1129 			<< "    uint values[" << m_numValues << "];\n"
1130 			<< "} sb_out;\n"
1131 			<< "void main (void) {\n"
1132 			<< "    uvec3 size           = gl_NumWorkGroups * gl_WorkGroupSize;\n"
1133 			<< "    uint numValuesPerInv = uint(ub_in.values.length()) / (size.x*size.y*size.z);\n"
1134 			<< "    uint groupNdx        = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
1135 			<< "    uint offset          = numValuesPerInv*groupNdx;\n"
1136 			<< "\n"
1137 			<< "    for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
1138 			<< "        sb_out.values[offset + ndx] = ~ub_in.values[offset + ndx];\n"
1139 			<< "}\n";
1140 	}
1141 	else if (m_bufferType == BUFFER_TYPE_SSBO)
1142 	{
1143 		src << "#version 310 es\n"
1144 			<< "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
1145 			<< "layout(binding = 0, std140) readonly buffer Input {\n"
1146 			<< "    uint values[" << m_numValues << "];\n"
1147 			<< "} sb_in;\n"
1148 			<< "layout (binding = 1, std140) writeonly buffer Output {\n"
1149 			<< "    uint values[" << m_numValues << "];\n"
1150 			<< "} sb_out;\n"
1151 			<< "void main (void) {\n"
1152 			<< "    uvec3 size           = gl_NumWorkGroups * gl_WorkGroupSize;\n"
1153 			<< "    uint numValuesPerInv = uint(sb_in.values.length()) / (size.x*size.y*size.z);\n"
1154 			<< "    uint groupNdx        = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
1155 			<< "    uint offset          = numValuesPerInv*groupNdx;\n"
1156 			<< "\n"
1157 			<< "    for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
1158 			<< "        sb_out.values[offset + ndx] = ~sb_in.values[offset + ndx];\n"
1159 			<< "}\n";
1160 	}
1161 
1162 	sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
1163 }
1164 
createInstance(Context & context) const1165 TestInstance* BufferToBufferInvertTest::createInstance (Context& context) const
1166 {
1167 	return new BufferToBufferInvertTestInstance(context, m_numValues, m_localSize, m_workSize, m_bufferType);
1168 }
1169 
BufferToBufferInvertTestInstance(Context & context,const deUint32 numValues,const tcu::IVec3 & localSize,const tcu::IVec3 & workSize,const BufferType bufferType)1170 BufferToBufferInvertTestInstance::BufferToBufferInvertTestInstance (Context&			context,
1171 																	const deUint32		numValues,
1172 																	const tcu::IVec3&	localSize,
1173 																	const tcu::IVec3&	workSize,
1174 																	const BufferType	bufferType)
1175 	: TestInstance	(context)
1176 	, m_bufferType	(bufferType)
1177 	, m_numValues	(numValues)
1178 	, m_localSize	(localSize)
1179 	, m_workSize	(workSize)
1180 {
1181 }
1182 
iterate(void)1183 tcu::TestStatus BufferToBufferInvertTestInstance::iterate (void)
1184 {
1185 	const DeviceInterface&	vk					= m_context.getDeviceInterface();
1186 	const VkDevice			device				= m_context.getDevice();
1187 	const VkQueue			queue				= m_context.getUniversalQueue();
1188 	const deUint32			queueFamilyIndex	= m_context.getUniversalQueueFamilyIndex();
1189 	Allocator&				allocator			= m_context.getDefaultAllocator();
1190 
1191 	// Customize the test based on buffer type
1192 
1193 	const VkBufferUsageFlags inputBufferUsageFlags		= (m_bufferType == BUFFER_TYPE_UNIFORM ? VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT : VK_BUFFER_USAGE_STORAGE_BUFFER_BIT);
1194 	const VkDescriptorType inputBufferDescriptorType	= (m_bufferType == BUFFER_TYPE_UNIFORM ? VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER : VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
1195 	const deUint32 randomSeed							= (m_bufferType == BUFFER_TYPE_UNIFORM ? 0x111223f : 0x124fef);
1196 
1197 	// Create an input buffer
1198 
1199 	const VkDeviceSize bufferSizeBytes = sizeof(tcu::UVec4) * m_numValues;
1200 	const Buffer inputBuffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, inputBufferUsageFlags), MemoryRequirement::HostVisible);
1201 
1202 	// Fill the input buffer with data
1203 	{
1204 		de::Random rnd(randomSeed);
1205 		const Allocation& inputBufferAllocation = inputBuffer.getAllocation();
1206 		tcu::UVec4* bufferPtr = static_cast<tcu::UVec4*>(inputBufferAllocation.getHostPtr());
1207 		for (deUint32 i = 0; i < m_numValues; ++i)
1208 			bufferPtr[i].x() = rnd.getUint32();
1209 
1210 		flushAlloc(vk, device, inputBufferAllocation);
1211 	}
1212 
1213 	// Create an output buffer
1214 
1215 	const Buffer outputBuffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
1216 
1217 	// Create descriptor set
1218 
1219 	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
1220 		DescriptorSetLayoutBuilder()
1221 		.addSingleBinding(inputBufferDescriptorType, VK_SHADER_STAGE_COMPUTE_BIT)
1222 		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
1223 		.build(vk, device));
1224 
1225 	const Unique<VkDescriptorPool> descriptorPool(
1226 		DescriptorPoolBuilder()
1227 		.addType(inputBufferDescriptorType)
1228 		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
1229 		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
1230 
1231 	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
1232 
1233 	const VkDescriptorBufferInfo inputBufferDescriptorInfo = makeDescriptorBufferInfo(*inputBuffer, 0ull, bufferSizeBytes);
1234 	const VkDescriptorBufferInfo outputBufferDescriptorInfo = makeDescriptorBufferInfo(*outputBuffer, 0ull, bufferSizeBytes);
1235 	DescriptorSetUpdateBuilder()
1236 		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), inputBufferDescriptorType, &inputBufferDescriptorInfo)
1237 		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &outputBufferDescriptorInfo)
1238 		.update(vk, device);
1239 
1240 	// Perform the computation
1241 
1242 	const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
1243 	const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
1244 	const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));
1245 
1246 	const VkBufferMemoryBarrier hostWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *inputBuffer, 0ull, bufferSizeBytes);
1247 
1248 	const VkBufferMemoryBarrier shaderWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *outputBuffer, 0ull, bufferSizeBytes);
1249 
1250 	const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
1251 	const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
1252 
1253 	// Start recording commands
1254 
1255 	beginCommandBuffer(vk, *cmdBuffer);
1256 
1257 	vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
1258 	vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
1259 
1260 	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
1261 	vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
1262 	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &shaderWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
1263 
1264 	endCommandBuffer(vk, *cmdBuffer);
1265 
1266 	// Wait for completion
1267 
1268 	submitCommandsAndWait(vk, device, queue, *cmdBuffer);
1269 
1270 	// Validate the results
1271 
1272 	const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
1273 	invalidateAlloc(vk, device, outputBufferAllocation);
1274 
1275 	const tcu::UVec4* bufferPtr = static_cast<tcu::UVec4*>(outputBufferAllocation.getHostPtr());
1276 	const tcu::UVec4* refBufferPtr = static_cast<tcu::UVec4*>(inputBuffer.getAllocation().getHostPtr());
1277 
1278 	for (deUint32 ndx = 0; ndx < m_numValues; ++ndx)
1279 	{
1280 		const deUint32 res = bufferPtr[ndx].x();
1281 		const deUint32 ref = ~refBufferPtr[ndx].x();
1282 
1283 		if (res != ref)
1284 		{
1285 			std::ostringstream msg;
1286 			msg << "Comparison failed for Output.values[" << ndx << "]";
1287 			return tcu::TestStatus::fail(msg.str());
1288 		}
1289 	}
1290 	return tcu::TestStatus::pass("Compute succeeded");
1291 }
1292 
1293 class InvertSSBOInPlaceTest : public vkt::TestCase
1294 {
1295 public:
1296 						InvertSSBOInPlaceTest	(tcu::TestContext&	testCtx,
1297 												 const std::string&	name,
1298 												 const std::string&	description,
1299 												 const deUint32		numValues,
1300 												 const bool			sized,
1301 												 const tcu::IVec3&	localSize,
1302 												 const tcu::IVec3&	workSize);
1303 
1304 
1305 	void				initPrograms			(SourceCollections& sourceCollections) const;
1306 	TestInstance*		createInstance			(Context&			context) const;
1307 
1308 private:
1309 	const deUint32		m_numValues;
1310 	const bool			m_sized;
1311 	const tcu::IVec3	m_localSize;
1312 	const tcu::IVec3	m_workSize;
1313 };
1314 
1315 class InvertSSBOInPlaceTestInstance : public vkt::TestInstance
1316 {
1317 public:
1318 									InvertSSBOInPlaceTestInstance	(Context&			context,
1319 																	 const deUint32		numValues,
1320 																	 const tcu::IVec3&	localSize,
1321 																	 const tcu::IVec3&	workSize);
1322 
1323 	tcu::TestStatus					iterate							(void);
1324 
1325 private:
1326 	const deUint32					m_numValues;
1327 	const tcu::IVec3				m_localSize;
1328 	const tcu::IVec3				m_workSize;
1329 };
1330 
InvertSSBOInPlaceTest(tcu::TestContext & testCtx,const std::string & name,const std::string & description,const deUint32 numValues,const bool sized,const tcu::IVec3 & localSize,const tcu::IVec3 & workSize)1331 InvertSSBOInPlaceTest::InvertSSBOInPlaceTest (tcu::TestContext&		testCtx,
1332 											  const std::string&	name,
1333 											  const std::string&	description,
1334 											  const deUint32		numValues,
1335 											  const bool			sized,
1336 											  const tcu::IVec3&		localSize,
1337 											  const tcu::IVec3&		workSize)
1338 	: TestCase		(testCtx, name, description)
1339 	, m_numValues	(numValues)
1340 	, m_sized		(sized)
1341 	, m_localSize	(localSize)
1342 	, m_workSize	(workSize)
1343 {
1344 	DE_ASSERT(m_numValues % (multiplyComponents(m_workSize) * multiplyComponents(m_localSize)) == 0);
1345 }
1346 
initPrograms(SourceCollections & sourceCollections) const1347 void InvertSSBOInPlaceTest::initPrograms (SourceCollections& sourceCollections) const
1348 {
1349 	std::ostringstream src;
1350 	src << "#version 310 es\n"
1351 		<< "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
1352 		<< "layout(binding = 0) buffer InOut {\n"
1353 		<< "    uint values[" << (m_sized ? de::toString(m_numValues) : "") << "];\n"
1354 		<< "} sb_inout;\n"
1355 		<< "void main (void) {\n"
1356 		<< "    uvec3 size           = gl_NumWorkGroups * gl_WorkGroupSize;\n"
1357 		<< "    uint numValuesPerInv = uint(sb_inout.values.length()) / (size.x*size.y*size.z);\n"
1358 		<< "    uint groupNdx        = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
1359 		<< "    uint offset          = numValuesPerInv*groupNdx;\n"
1360 		<< "\n"
1361 		<< "    for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
1362 		<< "        sb_inout.values[offset + ndx] = ~sb_inout.values[offset + ndx];\n"
1363 		<< "}\n";
1364 
1365 	sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
1366 }
1367 
createInstance(Context & context) const1368 TestInstance* InvertSSBOInPlaceTest::createInstance (Context& context) const
1369 {
1370 	return new InvertSSBOInPlaceTestInstance(context, m_numValues, m_localSize, m_workSize);
1371 }
1372 
InvertSSBOInPlaceTestInstance(Context & context,const deUint32 numValues,const tcu::IVec3 & localSize,const tcu::IVec3 & workSize)1373 InvertSSBOInPlaceTestInstance::InvertSSBOInPlaceTestInstance (Context&			context,
1374 															  const deUint32	numValues,
1375 															  const tcu::IVec3&	localSize,
1376 															  const tcu::IVec3&	workSize)
1377 	: TestInstance	(context)
1378 	, m_numValues	(numValues)
1379 	, m_localSize	(localSize)
1380 	, m_workSize	(workSize)
1381 {
1382 }
1383 
iterate(void)1384 tcu::TestStatus InvertSSBOInPlaceTestInstance::iterate (void)
1385 {
1386 	const DeviceInterface&	vk					= m_context.getDeviceInterface();
1387 	const VkDevice			device				= m_context.getDevice();
1388 	const VkQueue			queue				= m_context.getUniversalQueue();
1389 	const deUint32			queueFamilyIndex	= m_context.getUniversalQueueFamilyIndex();
1390 	Allocator&				allocator			= m_context.getDefaultAllocator();
1391 
1392 	// Create an input/output buffer
1393 
1394 	const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * m_numValues;
1395 	const Buffer buffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
1396 
1397 	// Fill the buffer with data
1398 
1399 	typedef std::vector<deUint32> data_vector_t;
1400 	data_vector_t inputData(m_numValues);
1401 
1402 	{
1403 		de::Random rnd(0x82ce7f);
1404 		const Allocation& bufferAllocation = buffer.getAllocation();
1405 		deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
1406 		for (deUint32 i = 0; i < m_numValues; ++i)
1407 			inputData[i] = *bufferPtr++ = rnd.getUint32();
1408 
1409 		flushAlloc(vk, device, bufferAllocation);
1410 	}
1411 
1412 	// Create descriptor set
1413 
1414 	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
1415 		DescriptorSetLayoutBuilder()
1416 		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
1417 		.build(vk, device));
1418 
1419 	const Unique<VkDescriptorPool> descriptorPool(
1420 		DescriptorPoolBuilder()
1421 		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
1422 		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
1423 
1424 	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
1425 
1426 	const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*buffer, 0ull, bufferSizeBytes);
1427 	DescriptorSetUpdateBuilder()
1428 		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
1429 		.update(vk, device);
1430 
1431 	// Perform the computation
1432 
1433 	const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
1434 	const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
1435 	const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));
1436 
1437 	const VkBufferMemoryBarrier hostWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *buffer, 0ull, bufferSizeBytes);
1438 
1439 	const VkBufferMemoryBarrier shaderWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer, 0ull, bufferSizeBytes);
1440 
1441 	const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
1442 	const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
1443 
1444 	// Start recording commands
1445 
1446 	beginCommandBuffer(vk, *cmdBuffer);
1447 
1448 	vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
1449 	vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
1450 
1451 	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
1452 	vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
1453 	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &shaderWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
1454 
1455 	endCommandBuffer(vk, *cmdBuffer);
1456 
1457 	// Wait for completion
1458 
1459 	submitCommandsAndWait(vk, device, queue, *cmdBuffer);
1460 
1461 	// Validate the results
1462 
1463 	const Allocation& bufferAllocation = buffer.getAllocation();
1464 	invalidateAlloc(vk, device, bufferAllocation);
1465 
1466 	const deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
1467 
1468 	for (deUint32 ndx = 0; ndx < m_numValues; ++ndx)
1469 	{
1470 		const deUint32 res = bufferPtr[ndx];
1471 		const deUint32 ref = ~inputData[ndx];
1472 
1473 		if (res != ref)
1474 		{
1475 			std::ostringstream msg;
1476 			msg << "Comparison failed for InOut.values[" << ndx << "]";
1477 			return tcu::TestStatus::fail(msg.str());
1478 		}
1479 	}
1480 	return tcu::TestStatus::pass("Compute succeeded");
1481 }
1482 
1483 class WriteToMultipleSSBOTest : public vkt::TestCase
1484 {
1485 public:
1486 						WriteToMultipleSSBOTest	(tcu::TestContext&	testCtx,
1487 												 const std::string&	name,
1488 												 const std::string&	description,
1489 												 const deUint32		numValues,
1490 												 const bool			sized,
1491 												 const tcu::IVec3&	localSize,
1492 												 const tcu::IVec3&	workSize);
1493 
1494 	void				initPrograms			(SourceCollections& sourceCollections) const;
1495 	TestInstance*		createInstance			(Context&			context) const;
1496 
1497 private:
1498 	const deUint32		m_numValues;
1499 	const bool			m_sized;
1500 	const tcu::IVec3	m_localSize;
1501 	const tcu::IVec3	m_workSize;
1502 };
1503 
1504 class WriteToMultipleSSBOTestInstance : public vkt::TestInstance
1505 {
1506 public:
1507 									WriteToMultipleSSBOTestInstance	(Context&			context,
1508 																	 const deUint32		numValues,
1509 																	 const tcu::IVec3&	localSize,
1510 																	 const tcu::IVec3&	workSize);
1511 
1512 	tcu::TestStatus					iterate							(void);
1513 
1514 private:
1515 	const deUint32					m_numValues;
1516 	const tcu::IVec3				m_localSize;
1517 	const tcu::IVec3				m_workSize;
1518 };
1519 
WriteToMultipleSSBOTest(tcu::TestContext & testCtx,const std::string & name,const std::string & description,const deUint32 numValues,const bool sized,const tcu::IVec3 & localSize,const tcu::IVec3 & workSize)1520 WriteToMultipleSSBOTest::WriteToMultipleSSBOTest (tcu::TestContext&		testCtx,
1521 												  const std::string&	name,
1522 												  const std::string&	description,
1523 												  const deUint32		numValues,
1524 												  const bool			sized,
1525 												  const tcu::IVec3&		localSize,
1526 												  const tcu::IVec3&		workSize)
1527 	: TestCase		(testCtx, name, description)
1528 	, m_numValues	(numValues)
1529 	, m_sized		(sized)
1530 	, m_localSize	(localSize)
1531 	, m_workSize	(workSize)
1532 {
1533 	DE_ASSERT(m_numValues % (multiplyComponents(m_workSize) * multiplyComponents(m_localSize)) == 0);
1534 }
1535 
initPrograms(SourceCollections & sourceCollections) const1536 void WriteToMultipleSSBOTest::initPrograms (SourceCollections& sourceCollections) const
1537 {
1538 	std::ostringstream src;
1539 	src << "#version 310 es\n"
1540 		<< "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
1541 		<< "layout(binding = 0) writeonly buffer Out0 {\n"
1542 		<< "    uint values[" << (m_sized ? de::toString(m_numValues) : "") << "];\n"
1543 		<< "} sb_out0;\n"
1544 		<< "layout(binding = 1) writeonly buffer Out1 {\n"
1545 		<< "    uint values[" << (m_sized ? de::toString(m_numValues) : "") << "];\n"
1546 		<< "} sb_out1;\n"
1547 		<< "void main (void) {\n"
1548 		<< "    uvec3 size      = gl_NumWorkGroups * gl_WorkGroupSize;\n"
1549 		<< "    uint groupNdx   = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
1550 		<< "\n"
1551 		<< "    {\n"
1552 		<< "        uint numValuesPerInv = uint(sb_out0.values.length()) / (size.x*size.y*size.z);\n"
1553 		<< "        uint offset          = numValuesPerInv*groupNdx;\n"
1554 		<< "\n"
1555 		<< "        for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
1556 		<< "            sb_out0.values[offset + ndx] = offset + ndx;\n"
1557 		<< "    }\n"
1558 		<< "    {\n"
1559 		<< "        uint numValuesPerInv = uint(sb_out1.values.length()) / (size.x*size.y*size.z);\n"
1560 		<< "        uint offset          = numValuesPerInv*groupNdx;\n"
1561 		<< "\n"
1562 		<< "        for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
1563 		<< "            sb_out1.values[offset + ndx] = uint(sb_out1.values.length()) - offset - ndx;\n"
1564 		<< "    }\n"
1565 		<< "}\n";
1566 
1567 	sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
1568 }
1569 
createInstance(Context & context) const1570 TestInstance* WriteToMultipleSSBOTest::createInstance (Context& context) const
1571 {
1572 	return new WriteToMultipleSSBOTestInstance(context, m_numValues, m_localSize, m_workSize);
1573 }
1574 
WriteToMultipleSSBOTestInstance(Context & context,const deUint32 numValues,const tcu::IVec3 & localSize,const tcu::IVec3 & workSize)1575 WriteToMultipleSSBOTestInstance::WriteToMultipleSSBOTestInstance (Context&			context,
1576 																  const deUint32	numValues,
1577 																  const tcu::IVec3&	localSize,
1578 																  const tcu::IVec3&	workSize)
1579 	: TestInstance	(context)
1580 	, m_numValues	(numValues)
1581 	, m_localSize	(localSize)
1582 	, m_workSize	(workSize)
1583 {
1584 }
1585 
iterate(void)1586 tcu::TestStatus WriteToMultipleSSBOTestInstance::iterate (void)
1587 {
1588 	const DeviceInterface&	vk					= m_context.getDeviceInterface();
1589 	const VkDevice			device				= m_context.getDevice();
1590 	const VkQueue			queue				= m_context.getUniversalQueue();
1591 	const deUint32			queueFamilyIndex	= m_context.getUniversalQueueFamilyIndex();
1592 	Allocator&				allocator			= m_context.getDefaultAllocator();
1593 
1594 	// Create two output buffers
1595 
1596 	const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * m_numValues;
1597 	const Buffer buffer0(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
1598 	const Buffer buffer1(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
1599 
1600 	// Create descriptor set
1601 
1602 	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
1603 		DescriptorSetLayoutBuilder()
1604 		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
1605 		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
1606 		.build(vk, device));
1607 
1608 	const Unique<VkDescriptorPool> descriptorPool(
1609 		DescriptorPoolBuilder()
1610 		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 2u)
1611 		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
1612 
1613 	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
1614 
1615 	const VkDescriptorBufferInfo buffer0DescriptorInfo = makeDescriptorBufferInfo(*buffer0, 0ull, bufferSizeBytes);
1616 	const VkDescriptorBufferInfo buffer1DescriptorInfo = makeDescriptorBufferInfo(*buffer1, 0ull, bufferSizeBytes);
1617 	DescriptorSetUpdateBuilder()
1618 		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &buffer0DescriptorInfo)
1619 		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &buffer1DescriptorInfo)
1620 		.update(vk, device);
1621 
1622 	// Perform the computation
1623 
1624 	const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
1625 	const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
1626 	const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));
1627 
1628 	const VkBufferMemoryBarrier shaderWriteBarriers[] =
1629 	{
1630 		makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer0, 0ull, bufferSizeBytes),
1631 		makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer1, 0ull, bufferSizeBytes)
1632 	};
1633 
1634 	const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
1635 	const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
1636 
1637 	// Start recording commands
1638 
1639 	beginCommandBuffer(vk, *cmdBuffer);
1640 
1641 	vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
1642 	vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
1643 
1644 	vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
1645 	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, DE_LENGTH_OF_ARRAY(shaderWriteBarriers), shaderWriteBarriers, 0, (const VkImageMemoryBarrier*)DE_NULL);
1646 
1647 	endCommandBuffer(vk, *cmdBuffer);
1648 
1649 	// Wait for completion
1650 
1651 	submitCommandsAndWait(vk, device, queue, *cmdBuffer);
1652 
1653 	// Validate the results
1654 	{
1655 		const Allocation& buffer0Allocation = buffer0.getAllocation();
1656 		invalidateAlloc(vk, device, buffer0Allocation);
1657 		const deUint32* buffer0Ptr = static_cast<deUint32*>(buffer0Allocation.getHostPtr());
1658 
1659 		for (deUint32 ndx = 0; ndx < m_numValues; ++ndx)
1660 		{
1661 			const deUint32 res = buffer0Ptr[ndx];
1662 			const deUint32 ref = ndx;
1663 
1664 			if (res != ref)
1665 			{
1666 				std::ostringstream msg;
1667 				msg << "Comparison failed for Out0.values[" << ndx << "] res=" << res << " ref=" << ref;
1668 				return tcu::TestStatus::fail(msg.str());
1669 			}
1670 		}
1671 	}
1672 	{
1673 		const Allocation& buffer1Allocation = buffer1.getAllocation();
1674 		invalidateAlloc(vk, device, buffer1Allocation);
1675 		const deUint32* buffer1Ptr = static_cast<deUint32*>(buffer1Allocation.getHostPtr());
1676 
1677 		for (deUint32 ndx = 0; ndx < m_numValues; ++ndx)
1678 		{
1679 			const deUint32 res = buffer1Ptr[ndx];
1680 			const deUint32 ref = m_numValues - ndx;
1681 
1682 			if (res != ref)
1683 			{
1684 				std::ostringstream msg;
1685 				msg << "Comparison failed for Out1.values[" << ndx << "] res=" << res << " ref=" << ref;
1686 				return tcu::TestStatus::fail(msg.str());
1687 			}
1688 		}
1689 	}
1690 	return tcu::TestStatus::pass("Compute succeeded");
1691 }
1692 
1693 class SSBOBarrierTest : public vkt::TestCase
1694 {
1695 public:
1696 						SSBOBarrierTest		(tcu::TestContext&	testCtx,
1697 											 const std::string&	name,
1698 											 const std::string&	description,
1699 											 const tcu::IVec3&	workSize);
1700 
1701 	void				initPrograms		(SourceCollections& sourceCollections) const;
1702 	TestInstance*		createInstance		(Context&			context) const;
1703 
1704 private:
1705 	const tcu::IVec3	m_workSize;
1706 };
1707 
1708 class SSBOBarrierTestInstance : public vkt::TestInstance
1709 {
1710 public:
1711 									SSBOBarrierTestInstance		(Context&			context,
1712 																 const tcu::IVec3&	workSize);
1713 
1714 	tcu::TestStatus					iterate						(void);
1715 
1716 private:
1717 	const tcu::IVec3				m_workSize;
1718 };
1719 
SSBOBarrierTest(tcu::TestContext & testCtx,const std::string & name,const std::string & description,const tcu::IVec3 & workSize)1720 SSBOBarrierTest::SSBOBarrierTest (tcu::TestContext&		testCtx,
1721 								  const std::string&	name,
1722 								  const std::string&	description,
1723 								  const tcu::IVec3&		workSize)
1724 	: TestCase		(testCtx, name, description)
1725 	, m_workSize	(workSize)
1726 {
1727 }
1728 
initPrograms(SourceCollections & sourceCollections) const1729 void SSBOBarrierTest::initPrograms (SourceCollections& sourceCollections) const
1730 {
1731 	sourceCollections.glslSources.add("comp0") << glu::ComputeSource(
1732 		"#version 310 es\n"
1733 		"layout (local_size_x = 1) in;\n"
1734 		"layout(binding = 2) readonly uniform Constants {\n"
1735 		"    uint u_baseVal;\n"
1736 		"};\n"
1737 		"layout(binding = 1) writeonly buffer Output {\n"
1738 		"    uint values[];\n"
1739 		"};\n"
1740 		"void main (void) {\n"
1741 		"    uint offset = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
1742 		"    values[offset] = u_baseVal + offset;\n"
1743 		"}\n");
1744 
1745 	sourceCollections.glslSources.add("comp1") << glu::ComputeSource(
1746 		"#version 310 es\n"
1747 		"layout (local_size_x = 1) in;\n"
1748 		"layout(binding = 1) readonly buffer Input {\n"
1749 		"    uint values[];\n"
1750 		"};\n"
1751 		"layout(binding = 0) coherent buffer Output {\n"
1752 		"    uint sum;\n"
1753 		"};\n"
1754 		"void main (void) {\n"
1755 		"    uint offset = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
1756 		"    uint value  = values[offset];\n"
1757 		"    atomicAdd(sum, value);\n"
1758 		"}\n");
1759 }
1760 
createInstance(Context & context) const1761 TestInstance* SSBOBarrierTest::createInstance (Context& context) const
1762 {
1763 	return new SSBOBarrierTestInstance(context, m_workSize);
1764 }
1765 
SSBOBarrierTestInstance(Context & context,const tcu::IVec3 & workSize)1766 SSBOBarrierTestInstance::SSBOBarrierTestInstance (Context& context, const tcu::IVec3& workSize)
1767 	: TestInstance	(context)
1768 	, m_workSize	(workSize)
1769 {
1770 }
1771 
iterate(void)1772 tcu::TestStatus SSBOBarrierTestInstance::iterate (void)
1773 {
1774 	const DeviceInterface&	vk					= m_context.getDeviceInterface();
1775 	const VkDevice			device				= m_context.getDevice();
1776 	const VkQueue			queue				= m_context.getUniversalQueue();
1777 	const deUint32			queueFamilyIndex	= m_context.getUniversalQueueFamilyIndex();
1778 	Allocator&				allocator			= m_context.getDefaultAllocator();
1779 
1780 	// Create a work buffer used by both shaders
1781 
1782 	const int workGroupCount = multiplyComponents(m_workSize);
1783 	const VkDeviceSize workBufferSizeBytes = sizeof(deUint32) * workGroupCount;
1784 	const Buffer workBuffer(vk, device, allocator, makeBufferCreateInfo(workBufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::Any);
1785 
1786 	// Create an output buffer
1787 
1788 	const VkDeviceSize outputBufferSizeBytes = sizeof(deUint32);
1789 	const Buffer outputBuffer(vk, device, allocator, makeBufferCreateInfo(outputBufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
1790 
1791 	// Initialize atomic counter value to zero
1792 	{
1793 		const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
1794 		deUint32* outputBufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
1795 		*outputBufferPtr = 0;
1796 		flushAlloc(vk, device, outputBufferAllocation);
1797 	}
1798 
1799 	// Create a uniform buffer (to pass uniform constants)
1800 
1801 	const VkDeviceSize uniformBufferSizeBytes = sizeof(deUint32);
1802 	const Buffer uniformBuffer(vk, device, allocator, makeBufferCreateInfo(uniformBufferSizeBytes, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT), MemoryRequirement::HostVisible);
1803 
1804 	// Set the constants in the uniform buffer
1805 
1806 	const deUint32	baseValue = 127;
1807 	{
1808 		const Allocation& uniformBufferAllocation = uniformBuffer.getAllocation();
1809 		deUint32* uniformBufferPtr = static_cast<deUint32*>(uniformBufferAllocation.getHostPtr());
1810 		uniformBufferPtr[0] = baseValue;
1811 
1812 		flushAlloc(vk, device, uniformBufferAllocation);
1813 	}
1814 
1815 	// Create descriptor set
1816 
1817 	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
1818 		DescriptorSetLayoutBuilder()
1819 		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
1820 		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
1821 		.addSingleBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
1822 		.build(vk, device));
1823 
1824 	const Unique<VkDescriptorPool> descriptorPool(
1825 		DescriptorPoolBuilder()
1826 		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 2u)
1827 		.addType(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)
1828 		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
1829 
1830 	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
1831 
1832 	const VkDescriptorBufferInfo workBufferDescriptorInfo = makeDescriptorBufferInfo(*workBuffer, 0ull, workBufferSizeBytes);
1833 	const VkDescriptorBufferInfo outputBufferDescriptorInfo = makeDescriptorBufferInfo(*outputBuffer, 0ull, outputBufferSizeBytes);
1834 	const VkDescriptorBufferInfo uniformBufferDescriptorInfo = makeDescriptorBufferInfo(*uniformBuffer, 0ull, uniformBufferSizeBytes);
1835 	DescriptorSetUpdateBuilder()
1836 		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &outputBufferDescriptorInfo)
1837 		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &workBufferDescriptorInfo)
1838 		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(2u), VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, &uniformBufferDescriptorInfo)
1839 		.update(vk, device);
1840 
1841 	// Perform the computation
1842 
1843 	const Unique<VkShaderModule> shaderModule0(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp0"), 0));
1844 	const Unique<VkShaderModule> shaderModule1(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp1"), 0));
1845 
1846 	const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
1847 	const Unique<VkPipeline> pipeline0(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule0));
1848 	const Unique<VkPipeline> pipeline1(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule1));
1849 
1850 	const VkBufferMemoryBarrier writeUniformConstantsBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_UNIFORM_READ_BIT, *uniformBuffer, 0ull, uniformBufferSizeBytes);
1851 
1852 	const VkBufferMemoryBarrier betweenShadersBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *workBuffer, 0ull, workBufferSizeBytes);
1853 
1854 	const VkBufferMemoryBarrier afterComputeBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *outputBuffer, 0ull, outputBufferSizeBytes);
1855 
1856 	const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
1857 	const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
1858 
1859 	// Start recording commands
1860 
1861 	beginCommandBuffer(vk, *cmdBuffer);
1862 
1863 	vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline0);
1864 	vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
1865 
1866 	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &writeUniformConstantsBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
1867 
1868 	vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
1869 	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &betweenShadersBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
1870 
1871 	// Switch to the second shader program
1872 	vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline1);
1873 
1874 	vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
1875 	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &afterComputeBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
1876 
1877 	endCommandBuffer(vk, *cmdBuffer);
1878 
1879 	// Wait for completion
1880 
1881 	submitCommandsAndWait(vk, device, queue, *cmdBuffer);
1882 
1883 	// Validate the results
1884 
1885 	const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
1886 	invalidateAlloc(vk, device, outputBufferAllocation);
1887 
1888 	const deUint32* bufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
1889 	const deUint32	res = *bufferPtr;
1890 	deUint32		ref = 0;
1891 
1892 	for (int ndx = 0; ndx < workGroupCount; ++ndx)
1893 		ref += baseValue + ndx;
1894 
1895 	if (res != ref)
1896 	{
1897 		std::ostringstream msg;
1898 		msg << "ERROR: comparison failed, expected " << ref << ", got " << res;
1899 		return tcu::TestStatus::fail(msg.str());
1900 	}
1901 	return tcu::TestStatus::pass("Compute succeeded");
1902 }
1903 
1904 class ImageAtomicOpTest : public vkt::TestCase
1905 {
1906 public:
1907 						ImageAtomicOpTest		(tcu::TestContext&	testCtx,
1908 												 const std::string& name,
1909 												 const std::string& description,
1910 												 const deUint32		localSize,
1911 												 const tcu::IVec2&	imageSize);
1912 
1913 	void				initPrograms			(SourceCollections& sourceCollections) const;
1914 	TestInstance*		createInstance			(Context&			context) const;
1915 
1916 private:
1917 	const deUint32		m_localSize;
1918 	const tcu::IVec2	m_imageSize;
1919 };
1920 
1921 class ImageAtomicOpTestInstance : public vkt::TestInstance
1922 {
1923 public:
1924 									ImageAtomicOpTestInstance		(Context&			context,
1925 																	 const deUint32		localSize,
1926 																	 const tcu::IVec2&	imageSize);
1927 
1928 	tcu::TestStatus					iterate							(void);
1929 
1930 private:
1931 	const deUint32					m_localSize;
1932 	const tcu::IVec2				m_imageSize;
1933 };
1934 
ImageAtomicOpTest(tcu::TestContext & testCtx,const std::string & name,const std::string & description,const deUint32 localSize,const tcu::IVec2 & imageSize)1935 ImageAtomicOpTest::ImageAtomicOpTest (tcu::TestContext&		testCtx,
1936 									  const std::string&	name,
1937 									  const std::string&	description,
1938 									  const deUint32		localSize,
1939 									  const tcu::IVec2&		imageSize)
1940 	: TestCase		(testCtx, name, description)
1941 	, m_localSize	(localSize)
1942 	, m_imageSize	(imageSize)
1943 {
1944 }
1945 
initPrograms(SourceCollections & sourceCollections) const1946 void ImageAtomicOpTest::initPrograms (SourceCollections& sourceCollections) const
1947 {
1948 	std::ostringstream src;
1949 	src << "#version 310 es\n"
1950 		<< "#extension GL_OES_shader_image_atomic : require\n"
1951 		<< "layout (local_size_x = " << m_localSize << ") in;\n"
1952 		<< "layout(binding = 1, r32ui) coherent uniform highp uimage2D u_dstImg;\n"
1953 		<< "layout(binding = 0) readonly buffer Input {\n"
1954 		<< "    uint values[" << (multiplyComponents(m_imageSize) * m_localSize) << "];\n"
1955 		<< "} sb_in;\n\n"
1956 		<< "void main (void) {\n"
1957 		<< "    uint stride = gl_NumWorkGroups.x*gl_WorkGroupSize.x;\n"
1958 		<< "    uint value  = sb_in.values[gl_GlobalInvocationID.y*stride + gl_GlobalInvocationID.x];\n"
1959 		<< "\n"
1960 		<< "    if (gl_LocalInvocationIndex == 0u)\n"
1961 		<< "        imageStore(u_dstImg, ivec2(gl_WorkGroupID.xy), uvec4(0));\n"
1962 		<< "    memoryBarrierImage();\n"
1963 		<< "    barrier();\n"
1964 		<< "    imageAtomicAdd(u_dstImg, ivec2(gl_WorkGroupID.xy), value);\n"
1965 		<< "}\n";
1966 
1967 	sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
1968 }
1969 
createInstance(Context & context) const1970 TestInstance* ImageAtomicOpTest::createInstance (Context& context) const
1971 {
1972 	return new ImageAtomicOpTestInstance(context, m_localSize, m_imageSize);
1973 }
1974 
ImageAtomicOpTestInstance(Context & context,const deUint32 localSize,const tcu::IVec2 & imageSize)1975 ImageAtomicOpTestInstance::ImageAtomicOpTestInstance (Context& context, const deUint32 localSize, const tcu::IVec2& imageSize)
1976 	: TestInstance	(context)
1977 	, m_localSize	(localSize)
1978 	, m_imageSize	(imageSize)
1979 {
1980 }
1981 
iterate(void)1982 tcu::TestStatus ImageAtomicOpTestInstance::iterate (void)
1983 {
1984 	const DeviceInterface&	vk					= m_context.getDeviceInterface();
1985 	const VkDevice			device				= m_context.getDevice();
1986 	const VkQueue			queue				= m_context.getUniversalQueue();
1987 	const deUint32			queueFamilyIndex	= m_context.getUniversalQueueFamilyIndex();
1988 	Allocator&				allocator			= m_context.getDefaultAllocator();
1989 
1990 	// Create an image
1991 
1992 	const VkImageCreateInfo imageParams = make2DImageCreateInfo(m_imageSize, VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_STORAGE_BIT);
1993 	const Image image(vk, device, allocator, imageParams, MemoryRequirement::Any);
1994 
1995 	const VkImageSubresourceRange subresourceRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
1996 	const Unique<VkImageView> imageView(makeImageView(vk, device, *image, VK_IMAGE_VIEW_TYPE_2D, VK_FORMAT_R32_UINT, subresourceRange));
1997 
1998 	// Input buffer
1999 
2000 	const deUint32 numInputValues = multiplyComponents(m_imageSize) * m_localSize;
2001 	const VkDeviceSize inputBufferSizeBytes = sizeof(deUint32) * numInputValues;
2002 
2003 	const Buffer inputBuffer(vk, device, allocator, makeBufferCreateInfo(inputBufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
2004 
2005 	// Populate the input buffer with test data
2006 	{
2007 		de::Random rnd(0x77238ac2);
2008 		const Allocation& inputBufferAllocation = inputBuffer.getAllocation();
2009 		deUint32* bufferPtr = static_cast<deUint32*>(inputBufferAllocation.getHostPtr());
2010 		for (deUint32 i = 0; i < numInputValues; ++i)
2011 			*bufferPtr++ = rnd.getUint32();
2012 
2013 		flushAlloc(vk, device, inputBufferAllocation);
2014 	}
2015 
2016 	// Create a buffer to store shader output (copied from image data)
2017 
2018 	const deUint32 imageArea = multiplyComponents(m_imageSize);
2019 	const VkDeviceSize outputBufferSizeBytes = sizeof(deUint32) * imageArea;
2020 	const Buffer outputBuffer(vk, device, allocator, makeBufferCreateInfo(outputBufferSizeBytes, VK_BUFFER_USAGE_TRANSFER_DST_BIT), MemoryRequirement::HostVisible);
2021 
2022 	// Create descriptor set
2023 
2024 	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
2025 		DescriptorSetLayoutBuilder()
2026 		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
2027 		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_SHADER_STAGE_COMPUTE_BIT)
2028 		.build(vk, device));
2029 
2030 	const Unique<VkDescriptorPool> descriptorPool(
2031 		DescriptorPoolBuilder()
2032 		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
2033 		.addType(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE)
2034 		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
2035 
2036 	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
2037 
2038 	// Set the bindings
2039 
2040 	const VkDescriptorImageInfo imageDescriptorInfo = makeDescriptorImageInfo(DE_NULL, *imageView, VK_IMAGE_LAYOUT_GENERAL);
2041 	const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*inputBuffer, 0ull, inputBufferSizeBytes);
2042 
2043 	DescriptorSetUpdateBuilder()
2044 		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
2045 		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &imageDescriptorInfo)
2046 		.update(vk, device);
2047 
2048 	// Perform the computation
2049 	{
2050 		const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
2051 		const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
2052 		const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));
2053 
2054 		const VkBufferMemoryBarrier inputBufferPostHostWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *inputBuffer, 0ull, inputBufferSizeBytes);
2055 
2056 		const VkImageMemoryBarrier imageLayoutBarrier = makeImageMemoryBarrier(
2057 			(VkAccessFlags)0, VK_ACCESS_SHADER_WRITE_BIT,
2058 			VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL,
2059 			*image, subresourceRange);
2060 
2061 		// Prepare the command buffer
2062 
2063 		const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
2064 		const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
2065 
2066 		// Start recording commands
2067 
2068 		beginCommandBuffer(vk, *cmdBuffer);
2069 
2070 		vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
2071 		vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
2072 
2073 		vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &inputBufferPostHostWriteBarrier, 1, &imageLayoutBarrier);
2074 		vk.cmdDispatch(*cmdBuffer, m_imageSize.x(), m_imageSize.y(), 1u);
2075 
2076 		copyImageToBuffer(vk, *cmdBuffer, *image, *outputBuffer, m_imageSize, VK_ACCESS_SHADER_WRITE_BIT, VK_IMAGE_LAYOUT_GENERAL);
2077 
2078 		endCommandBuffer(vk, *cmdBuffer);
2079 
2080 		// Wait for completion
2081 
2082 		submitCommandsAndWait(vk, device, queue, *cmdBuffer);
2083 	}
2084 
2085 	// Validate the results
2086 
2087 	const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
2088 	invalidateAlloc(vk, device, outputBufferAllocation);
2089 
2090 	const deUint32* bufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
2091 	const deUint32* refBufferPtr = static_cast<deUint32*>(inputBuffer.getAllocation().getHostPtr());
2092 
2093 	for (deUint32 pixelNdx = 0; pixelNdx < imageArea; ++pixelNdx)
2094 	{
2095 		const deUint32	res = bufferPtr[pixelNdx];
2096 		deUint32		ref = 0;
2097 
2098 		for (deUint32 offs = 0; offs < m_localSize; ++offs)
2099 			ref += refBufferPtr[pixelNdx * m_localSize + offs];
2100 
2101 		if (res != ref)
2102 		{
2103 			std::ostringstream msg;
2104 			msg << "Comparison failed for pixel " << pixelNdx;
2105 			return tcu::TestStatus::fail(msg.str());
2106 		}
2107 	}
2108 	return tcu::TestStatus::pass("Compute succeeded");
2109 }
2110 
2111 class ImageBarrierTest : public vkt::TestCase
2112 {
2113 public:
2114 						ImageBarrierTest	(tcu::TestContext&	testCtx,
2115 											const std::string&	name,
2116 											const std::string&	description,
2117 											const tcu::IVec2&	imageSize);
2118 
2119 	void				initPrograms		(SourceCollections& sourceCollections) const;
2120 	TestInstance*		createInstance		(Context&			context) const;
2121 
2122 private:
2123 	const tcu::IVec2	m_imageSize;
2124 };
2125 
2126 class ImageBarrierTestInstance : public vkt::TestInstance
2127 {
2128 public:
2129 									ImageBarrierTestInstance	(Context&			context,
2130 																 const tcu::IVec2&	imageSize);
2131 
2132 	tcu::TestStatus					iterate						(void);
2133 
2134 private:
2135 	const tcu::IVec2				m_imageSize;
2136 };
2137 
ImageBarrierTest(tcu::TestContext & testCtx,const std::string & name,const std::string & description,const tcu::IVec2 & imageSize)2138 ImageBarrierTest::ImageBarrierTest (tcu::TestContext&	testCtx,
2139 									const std::string&	name,
2140 									const std::string&	description,
2141 									const tcu::IVec2&	imageSize)
2142 	: TestCase		(testCtx, name, description)
2143 	, m_imageSize	(imageSize)
2144 {
2145 }
2146 
initPrograms(SourceCollections & sourceCollections) const2147 void ImageBarrierTest::initPrograms (SourceCollections& sourceCollections) const
2148 {
2149 	sourceCollections.glslSources.add("comp0") << glu::ComputeSource(
2150 		"#version 310 es\n"
2151 		"layout (local_size_x = 1) in;\n"
2152 		"layout(binding = 2) readonly uniform Constants {\n"
2153 		"    uint u_baseVal;\n"
2154 		"};\n"
2155 		"layout(binding = 1, r32ui) writeonly uniform highp uimage2D u_img;\n"
2156 		"void main (void) {\n"
2157 		"    uint offset = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
2158 		"    imageStore(u_img, ivec2(gl_WorkGroupID.xy), uvec4(offset + u_baseVal, 0, 0, 0));\n"
2159 		"}\n");
2160 
2161 	sourceCollections.glslSources.add("comp1") << glu::ComputeSource(
2162 		"#version 310 es\n"
2163 		"layout (local_size_x = 1) in;\n"
2164 		"layout(binding = 1, r32ui) readonly uniform highp uimage2D u_img;\n"
2165 		"layout(binding = 0) coherent buffer Output {\n"
2166 		"    uint sum;\n"
2167 		"};\n"
2168 		"void main (void) {\n"
2169 		"    uint value = imageLoad(u_img, ivec2(gl_WorkGroupID.xy)).x;\n"
2170 		"    atomicAdd(sum, value);\n"
2171 		"}\n");
2172 }
2173 
createInstance(Context & context) const2174 TestInstance* ImageBarrierTest::createInstance (Context& context) const
2175 {
2176 	return new ImageBarrierTestInstance(context, m_imageSize);
2177 }
2178 
ImageBarrierTestInstance(Context & context,const tcu::IVec2 & imageSize)2179 ImageBarrierTestInstance::ImageBarrierTestInstance (Context& context, const tcu::IVec2& imageSize)
2180 	: TestInstance	(context)
2181 	, m_imageSize	(imageSize)
2182 {
2183 }
2184 
iterate(void)2185 tcu::TestStatus ImageBarrierTestInstance::iterate (void)
2186 {
2187 	const DeviceInterface&	vk					= m_context.getDeviceInterface();
2188 	const VkDevice			device				= m_context.getDevice();
2189 	const VkQueue			queue				= m_context.getUniversalQueue();
2190 	const deUint32			queueFamilyIndex	= m_context.getUniversalQueueFamilyIndex();
2191 	Allocator&				allocator			= m_context.getDefaultAllocator();
2192 
2193 	// Create an image used by both shaders
2194 
2195 	const VkImageCreateInfo imageParams = make2DImageCreateInfo(m_imageSize, VK_IMAGE_USAGE_STORAGE_BIT);
2196 	const Image image(vk, device, allocator, imageParams, MemoryRequirement::Any);
2197 
2198 	const VkImageSubresourceRange subresourceRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
2199 	const Unique<VkImageView> imageView(makeImageView(vk, device, *image, VK_IMAGE_VIEW_TYPE_2D, VK_FORMAT_R32_UINT, subresourceRange));
2200 
2201 	// Create an output buffer
2202 
2203 	const VkDeviceSize outputBufferSizeBytes = sizeof(deUint32);
2204 	const Buffer outputBuffer(vk, device, allocator, makeBufferCreateInfo(outputBufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
2205 
2206 	// Initialize atomic counter value to zero
2207 	{
2208 		const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
2209 		deUint32* outputBufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
2210 		*outputBufferPtr = 0;
2211 		flushAlloc(vk, device, outputBufferAllocation);
2212 	}
2213 
2214 	// Create a uniform buffer (to pass uniform constants)
2215 
2216 	const VkDeviceSize uniformBufferSizeBytes = sizeof(deUint32);
2217 	const Buffer uniformBuffer(vk, device, allocator, makeBufferCreateInfo(uniformBufferSizeBytes, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT), MemoryRequirement::HostVisible);
2218 
2219 	// Set the constants in the uniform buffer
2220 
2221 	const deUint32	baseValue = 127;
2222 	{
2223 		const Allocation& uniformBufferAllocation = uniformBuffer.getAllocation();
2224 		deUint32* uniformBufferPtr = static_cast<deUint32*>(uniformBufferAllocation.getHostPtr());
2225 		uniformBufferPtr[0] = baseValue;
2226 
2227 		flushAlloc(vk, device, uniformBufferAllocation);
2228 	}
2229 
2230 	// Create descriptor set
2231 
2232 	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
2233 		DescriptorSetLayoutBuilder()
2234 		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
2235 		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_SHADER_STAGE_COMPUTE_BIT)
2236 		.addSingleBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
2237 		.build(vk, device));
2238 
2239 	const Unique<VkDescriptorPool> descriptorPool(
2240 		DescriptorPoolBuilder()
2241 		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
2242 		.addType(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE)
2243 		.addType(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)
2244 		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
2245 
2246 	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
2247 
2248 	const VkDescriptorImageInfo imageDescriptorInfo = makeDescriptorImageInfo(DE_NULL, *imageView, VK_IMAGE_LAYOUT_GENERAL);
2249 	const VkDescriptorBufferInfo outputBufferDescriptorInfo = makeDescriptorBufferInfo(*outputBuffer, 0ull, outputBufferSizeBytes);
2250 	const VkDescriptorBufferInfo uniformBufferDescriptorInfo = makeDescriptorBufferInfo(*uniformBuffer, 0ull, uniformBufferSizeBytes);
2251 	DescriptorSetUpdateBuilder()
2252 		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &outputBufferDescriptorInfo)
2253 		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &imageDescriptorInfo)
2254 		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(2u), VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, &uniformBufferDescriptorInfo)
2255 		.update(vk, device);
2256 
2257 	// Perform the computation
2258 
2259 	const Unique<VkShaderModule>	shaderModule0(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp0"), 0));
2260 	const Unique<VkShaderModule>	shaderModule1(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp1"), 0));
2261 
2262 	const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
2263 	const Unique<VkPipeline> pipeline0(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule0));
2264 	const Unique<VkPipeline> pipeline1(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule1));
2265 
2266 	const VkBufferMemoryBarrier writeUniformConstantsBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_UNIFORM_READ_BIT, *uniformBuffer, 0ull, uniformBufferSizeBytes);
2267 
2268 	const VkImageMemoryBarrier imageLayoutBarrier = makeImageMemoryBarrier(
2269 		0u, 0u,
2270 		VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL,
2271 		*image, subresourceRange);
2272 
2273 	const VkImageMemoryBarrier imageBarrierBetweenShaders = makeImageMemoryBarrier(
2274 		VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT,
2275 		VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
2276 		*image, subresourceRange);
2277 
2278 	const VkBufferMemoryBarrier afterComputeBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *outputBuffer, 0ull, outputBufferSizeBytes);
2279 
2280 	const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
2281 	const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
2282 
2283 	// Start recording commands
2284 
2285 	beginCommandBuffer(vk, *cmdBuffer);
2286 
2287 	vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline0);
2288 	vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
2289 
2290 	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &writeUniformConstantsBarrier, 1, &imageLayoutBarrier);
2291 
2292 	vk.cmdDispatch(*cmdBuffer, m_imageSize.x(), m_imageSize.y(), 1u);
2293 	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 0, (const VkBufferMemoryBarrier*)DE_NULL, 1, &imageBarrierBetweenShaders);
2294 
2295 	// Switch to the second shader program
2296 	vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline1);
2297 
2298 	vk.cmdDispatch(*cmdBuffer, m_imageSize.x(), m_imageSize.y(), 1u);
2299 	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &afterComputeBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
2300 
2301 	endCommandBuffer(vk, *cmdBuffer);
2302 
2303 	// Wait for completion
2304 
2305 	submitCommandsAndWait(vk, device, queue, *cmdBuffer);
2306 
2307 	// Validate the results
2308 
2309 	const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
2310 	invalidateAlloc(vk, device, outputBufferAllocation);
2311 
2312 	const int		numValues = multiplyComponents(m_imageSize);
2313 	const deUint32* bufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
2314 	const deUint32	res = *bufferPtr;
2315 	deUint32		ref = 0;
2316 
2317 	for (int ndx = 0; ndx < numValues; ++ndx)
2318 		ref += baseValue + ndx;
2319 
2320 	if (res != ref)
2321 	{
2322 		std::ostringstream msg;
2323 		msg << "ERROR: comparison failed, expected " << ref << ", got " << res;
2324 		return tcu::TestStatus::fail(msg.str());
2325 	}
2326 	return tcu::TestStatus::pass("Compute succeeded");
2327 }
2328 
2329 class ComputeTestInstance : public vkt::TestInstance
2330 {
2331 public:
ComputeTestInstance(Context & context)2332 		ComputeTestInstance		(Context& context)
2333 		: TestInstance			(context)
2334 		, m_numPhysDevices		(1)
2335 		, m_queueFamilyIndex	(0)
2336 	{
2337 		createDeviceGroup();
2338 	}
2339 
2340 	void							createDeviceGroup	(void);
getDeviceInterface(void)2341 	const vk::DeviceInterface&		getDeviceInterface	(void)			{ return *m_deviceDriver; }
getInstance(void)2342 	vk::VkInstance					getInstance			(void)			{ return m_deviceGroupInstance; }
getDevice(void)2343 	vk::VkDevice					getDevice			(void)			{ return *m_logicalDevice; }
getPhysicalDevice(deUint32 i=0)2344 	vk::VkPhysicalDevice			getPhysicalDevice	(deUint32 i = 0){ return m_physicalDevices[i]; }
2345 
2346 protected:
2347 	deUint32						m_numPhysDevices;
2348 	deUint32						m_queueFamilyIndex;
2349 
2350 private:
2351 	CustomInstance						m_deviceGroupInstance;
2352 	vk::Move<vk::VkDevice>				m_logicalDevice;
2353 	std::vector<vk::VkPhysicalDevice>	m_physicalDevices;
2354 	de::MovePtr<vk::DeviceDriver>		m_deviceDriver;
2355 };
2356 
createDeviceGroup(void)2357 void ComputeTestInstance::createDeviceGroup (void)
2358 {
2359 	const tcu::CommandLine&							cmdLine					= m_context.getTestContext().getCommandLine();
2360 	const deUint32									devGroupIdx				= cmdLine.getVKDeviceGroupId() - 1;
2361 	const deUint32									physDeviceIdx			= cmdLine.getVKDeviceId() - 1;
2362 	const float										queuePriority			= 1.0f;
2363 	const std::vector<std::string>					requiredExtensions		(1, "VK_KHR_device_group_creation");
2364 	m_deviceGroupInstance													= createCustomInstanceWithExtensions(m_context, requiredExtensions);
2365 	std::vector<VkPhysicalDeviceGroupProperties>	devGroupProperties		= enumeratePhysicalDeviceGroups(m_context.getInstanceInterface(), m_deviceGroupInstance);
2366 	m_numPhysDevices														= devGroupProperties[devGroupIdx].physicalDeviceCount;
2367 	std::vector<const char*>						deviceExtensions;
2368 
2369 	if (!isCoreDeviceExtension(m_context.getUsedApiVersion(), "VK_KHR_device_group"))
2370 		deviceExtensions.push_back("VK_KHR_device_group");
2371 
2372 	VkDeviceGroupDeviceCreateInfo					deviceGroupInfo			=
2373 	{
2374 		VK_STRUCTURE_TYPE_DEVICE_GROUP_DEVICE_CREATE_INFO_KHR,								//stype
2375 		DE_NULL,																			//pNext
2376 		devGroupProperties[devGroupIdx].physicalDeviceCount,								//physicalDeviceCount
2377 		devGroupProperties[devGroupIdx].physicalDevices										//physicalDevices
2378 	};
2379 	const InstanceDriver&							instance				(m_deviceGroupInstance.getDriver());
2380 	const VkPhysicalDeviceFeatures					deviceFeatures			= getPhysicalDeviceFeatures(instance, deviceGroupInfo.pPhysicalDevices[physDeviceIdx]);
2381 	const std::vector<VkQueueFamilyProperties>		queueProps				= getPhysicalDeviceQueueFamilyProperties(instance, devGroupProperties[devGroupIdx].physicalDevices[physDeviceIdx]);
2382 
2383 	m_physicalDevices.resize(m_numPhysDevices);
2384 	for (deUint32 physDevIdx = 0; physDevIdx < m_numPhysDevices; physDevIdx++)
2385 		m_physicalDevices[physDevIdx] = devGroupProperties[devGroupIdx].physicalDevices[physDevIdx];
2386 
2387 	for (size_t queueNdx = 0; queueNdx < queueProps.size(); queueNdx++)
2388 	{
2389 		if (queueProps[queueNdx].queueFlags & VK_QUEUE_COMPUTE_BIT)
2390 			m_queueFamilyIndex = (deUint32)queueNdx;
2391 	}
2392 
2393 	VkDeviceQueueCreateInfo							queueInfo				=
2394 	{
2395 		VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,		// VkStructureType					sType;
2396 		DE_NULL,										// const void*						pNext;
2397 		(VkDeviceQueueCreateFlags)0u,					// VkDeviceQueueCreateFlags			flags;
2398 		m_queueFamilyIndex,								// deUint32							queueFamilyIndex;
2399 		1u,												// deUint32							queueCount;
2400 		&queuePriority									// const float*						pQueuePriorities;
2401 	};
2402 
2403 	const VkDeviceCreateInfo						deviceInfo				=
2404 	{
2405 		VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,							// VkStructureType					sType;
2406 		&deviceGroupInfo,												// const void*						pNext;
2407 		(VkDeviceCreateFlags)0,											// VkDeviceCreateFlags				flags;
2408 		1u	,															// uint32_t							queueCreateInfoCount;
2409 		&queueInfo,														// const VkDeviceQueueCreateInfo*	pQueueCreateInfos;
2410 		0u,																// uint32_t							enabledLayerCount;
2411 		DE_NULL,														// const char* const*				ppEnabledLayerNames;
2412 		deUint32(deviceExtensions.size()),								// uint32_t							enabledExtensionCount;
2413 		(deviceExtensions.empty() ? DE_NULL : &deviceExtensions[0]),	// const char* const*				ppEnabledExtensionNames;
2414 		&deviceFeatures,												// const VkPhysicalDeviceFeatures*	pEnabledFeatures;
2415 	};
2416 
2417 	m_logicalDevice		= createCustomDevice(m_context.getTestContext().getCommandLine().isValidationEnabled(), m_context.getPlatformInterface(), m_deviceGroupInstance, instance, deviceGroupInfo.pPhysicalDevices[physDeviceIdx], &deviceInfo);
2418 	m_deviceDriver		= de::MovePtr<DeviceDriver>(new DeviceDriver(m_context.getPlatformInterface(), m_deviceGroupInstance, *m_logicalDevice));
2419 }
2420 
2421 class DispatchBaseTest : public vkt::TestCase
2422 {
2423 public:
2424 						DispatchBaseTest	(tcu::TestContext&	testCtx,
2425 											const std::string&	name,
2426 											const std::string&	description,
2427 											const deUint32		numValues,
2428 											const tcu::IVec3&	localsize,
2429 											const tcu::IVec3&	worksize,
2430 											const tcu::IVec3&	splitsize);
2431 
2432 	void				initPrograms		(SourceCollections& sourceCollections) const;
2433 	TestInstance*		createInstance		(Context&			context) const;
2434 
2435 private:
2436 	const deUint32					m_numValues;
2437 	const tcu::IVec3				m_localSize;
2438 	const tcu::IVec3				m_workSize;
2439 	const tcu::IVec3				m_splitSize;
2440 };
2441 
2442 class DispatchBaseTestInstance : public ComputeTestInstance
2443 {
2444 public:
2445 									DispatchBaseTestInstance	(Context&			context,
2446 																const deUint32		numValues,
2447 																const tcu::IVec3&	localsize,
2448 																const tcu::IVec3&	worksize,
2449 																const tcu::IVec3&	splitsize);
2450 
2451 	bool							isInputVectorValid			(const tcu::IVec3& small, const tcu::IVec3& big);
2452 	tcu::TestStatus					iterate						(void);
2453 
2454 private:
2455 	const deUint32					m_numValues;
2456 	const tcu::IVec3				m_localSize;
2457 	const tcu::IVec3				m_workSize;
2458 	const tcu::IVec3				m_splitWorkSize;
2459 };
2460 
DispatchBaseTest(tcu::TestContext & testCtx,const std::string & name,const std::string & description,const deUint32 numValues,const tcu::IVec3 & localsize,const tcu::IVec3 & worksize,const tcu::IVec3 & splitsize)2461 DispatchBaseTest::DispatchBaseTest (tcu::TestContext&	testCtx,
2462 									const std::string&	name,
2463 									const std::string&	description,
2464 									const deUint32		numValues,
2465 									const tcu::IVec3&	localsize,
2466 									const tcu::IVec3&	worksize,
2467 									const tcu::IVec3&	splitsize)
2468 	: TestCase		(testCtx, name, description)
2469 	, m_numValues	(numValues)
2470 	, m_localSize	(localsize)
2471 	, m_workSize	(worksize)
2472 	, m_splitSize	(splitsize)
2473 {
2474 }
2475 
initPrograms(SourceCollections & sourceCollections) const2476 void DispatchBaseTest::initPrograms (SourceCollections& sourceCollections) const
2477 {
2478 	std::ostringstream src;
2479 	src << "#version 310 es\n"
2480 		<< "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
2481 
2482 		<< "layout(binding = 0) buffer InOut {\n"
2483 		<< "    uint values[" << de::toString(m_numValues) << "];\n"
2484 		<< "} sb_inout;\n"
2485 
2486 		<< "layout(binding = 1) readonly uniform uniformInput {\n"
2487 		<< "    uvec3 gridSize;\n"
2488 		<< "} ubo_in;\n"
2489 
2490 		<< "void main (void) {\n"
2491 		<< "    uvec3 size = ubo_in.gridSize * gl_WorkGroupSize;\n"
2492 		<< "    uint numValuesPerInv = uint(sb_inout.values.length()) / (size.x*size.y*size.z);\n"
2493 		<< "    uint index = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
2494 		<< "    uint offset = numValuesPerInv*index;\n"
2495 		<< "    for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
2496 		<< "        sb_inout.values[offset + ndx] = ~sb_inout.values[offset + ndx];\n"
2497 		<< "}\n";
2498 
2499 	sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
2500 }
2501 
createInstance(Context & context) const2502 TestInstance* DispatchBaseTest::createInstance (Context& context) const
2503 {
2504 	return new DispatchBaseTestInstance(context, m_numValues, m_localSize, m_workSize, m_splitSize);
2505 }
2506 
DispatchBaseTestInstance(Context & context,const deUint32 numValues,const tcu::IVec3 & localsize,const tcu::IVec3 & worksize,const tcu::IVec3 & splitsize)2507 DispatchBaseTestInstance::DispatchBaseTestInstance (Context& context,
2508 													const deUint32		numValues,
2509 													const tcu::IVec3&	localsize,
2510 													const tcu::IVec3&	worksize,
2511 													const tcu::IVec3&	splitsize)
2512 
2513 	: ComputeTestInstance	(context)
2514 	, m_numValues			(numValues)
2515 	, m_localSize			(localsize)
2516 	, m_workSize			(worksize)
2517 	, m_splitWorkSize		(splitsize)
2518 {
2519 	// For easy work distribution across physical devices:
2520 	// WorkSize should be a multiple of SplitWorkSize only in the X component
2521 	if ((!isInputVectorValid(m_splitWorkSize, m_workSize)) ||
2522 		(m_workSize.x() <= m_splitWorkSize.x()) ||
2523 		(m_workSize.y() != m_splitWorkSize.y()) ||
2524 		(m_workSize.z() != m_splitWorkSize.z()))
2525 		TCU_THROW(TestError, "Invalid Input.");
2526 
2527 	// For easy work distribution within the same physical device:
2528 	// SplitWorkSize should be a multiple of localSize in Y or Z component
2529 	if ((!isInputVectorValid(m_localSize, m_splitWorkSize)) ||
2530 		(m_localSize.x() != m_splitWorkSize.x()) ||
2531 		(m_localSize.y() >= m_splitWorkSize.y()) ||
2532 		(m_localSize.z() >= m_splitWorkSize.z()))
2533 		TCU_THROW(TestError, "Invalid Input.");
2534 
2535 	if ((multiplyComponents(m_workSize) / multiplyComponents(m_splitWorkSize)) < (deInt32) m_numPhysDevices)
2536 		TCU_THROW(TestError, "Not enough work to distribute across all physical devices.");
2537 
2538 	deUint32 totalWork = multiplyComponents(m_workSize) * multiplyComponents(m_localSize);
2539 	if ((totalWork > numValues) || (numValues % totalWork != 0))
2540 		TCU_THROW(TestError, "Buffer too small/not aligned to cover all values.");
2541 }
2542 
isInputVectorValid(const tcu::IVec3 & small,const tcu::IVec3 & big)2543 bool DispatchBaseTestInstance::isInputVectorValid(const tcu::IVec3& small, const tcu::IVec3& big)
2544 {
2545 	if (((big.x() < small.x()) || (big.y() < small.y()) || (big.z() < small.z())) ||
2546 		((big.x() % small.x() != 0) || (big.y() % small.y() != 0) || (big.z() % small.z() != 0)))
2547 		return false;
2548 	return true;
2549 }
2550 
iterate(void)2551 tcu::TestStatus DispatchBaseTestInstance::iterate (void)
2552 {
2553 	const DeviceInterface&	vk					= getDeviceInterface();
2554 	const VkDevice			device				= getDevice();
2555 	const VkQueue			queue				= getDeviceQueue(vk, device, m_queueFamilyIndex, 0);
2556 	SimpleAllocator			allocator			(vk, device, getPhysicalDeviceMemoryProperties(m_context.getInstanceInterface(), getPhysicalDevice()));
2557 	deUint32				totalWorkloadSize	= 0;
2558 
2559 	// Create an uniform and input/output buffer
2560 	const deUint32 uniformBufSize = 3; // Pass the compute grid size
2561 	const VkDeviceSize uniformBufferSizeBytes = sizeof(deUint32) * uniformBufSize;
2562 	const Buffer uniformBuffer(vk, device, allocator, makeBufferCreateInfo(uniformBufferSizeBytes, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT), MemoryRequirement::HostVisible);
2563 
2564 	const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * m_numValues;
2565 	const Buffer buffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
2566 
2567 	// Fill the buffers with data
2568 	typedef std::vector<deUint32> data_vector_t;
2569 	data_vector_t uniformInputData(uniformBufSize);
2570 	data_vector_t inputData(m_numValues);
2571 
2572 	{
2573 		const Allocation& bufferAllocation = uniformBuffer.getAllocation();
2574 		deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
2575 		uniformInputData[0] = *bufferPtr++ = m_workSize.x();
2576 		uniformInputData[1] = *bufferPtr++ = m_workSize.y();
2577 		uniformInputData[2] = *bufferPtr++ = m_workSize.z();
2578 		flushAlloc(vk, device, bufferAllocation);
2579 	}
2580 
2581 	{
2582 		de::Random rnd(0x82ce7f);
2583 		const Allocation& bufferAllocation = buffer.getAllocation();
2584 		deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
2585 		for (deUint32 i = 0; i < m_numValues; ++i)
2586 			inputData[i] = *bufferPtr++ = rnd.getUint32();
2587 
2588 		flushAlloc(vk, device, bufferAllocation);
2589 	}
2590 
2591 	// Create descriptor set
2592 	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
2593 		DescriptorSetLayoutBuilder()
2594 		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
2595 		.addSingleBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
2596 		.build(vk, device));
2597 
2598 	const Unique<VkDescriptorPool> descriptorPool(
2599 		DescriptorPoolBuilder()
2600 		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
2601 		.addType(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)
2602 		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
2603 
2604 	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
2605 
2606 	const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*buffer, 0ull, bufferSizeBytes);
2607 	const VkDescriptorBufferInfo uniformBufferDescriptorInfo = makeDescriptorBufferInfo(*uniformBuffer, 0ull, uniformBufferSizeBytes);
2608 
2609 	DescriptorSetUpdateBuilder()
2610 		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
2611 		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, &uniformBufferDescriptorInfo)
2612 		.update(vk, device);
2613 
2614 	const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
2615 	const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
2616 	const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, static_cast<VkPipelineCreateFlags>(VK_PIPELINE_CREATE_DISPATCH_BASE), *shaderModule, static_cast<VkPipelineShaderStageCreateFlags>(0u)));
2617 
2618 	const VkBufferMemoryBarrier hostWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *buffer, 0ull, bufferSizeBytes);
2619 	const VkBufferMemoryBarrier hostUniformWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_UNIFORM_READ_BIT, *uniformBuffer, 0ull, uniformBufferSizeBytes);
2620 
2621 	const VkBufferMemoryBarrier shaderWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer, 0ull, bufferSizeBytes);
2622 
2623 	const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, m_queueFamilyIndex));
2624 	const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
2625 
2626 	// Start recording commands
2627 	beginCommandBuffer(vk, *cmdBuffer);
2628 
2629 	vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
2630 	vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
2631 
2632 	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostUniformWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
2633 
2634 	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
2635 
2636 	// Split the workload across all physical devices based on m_splitWorkSize.x()
2637 	for (deUint32 physDevIdx = 0; physDevIdx < m_numPhysDevices; physDevIdx++)
2638 	{
2639 		deUint32 baseGroupX = physDevIdx * m_splitWorkSize.x();
2640 		deUint32 baseGroupY = 0;
2641 		deUint32 baseGroupZ = 0;
2642 
2643 		// Split the workload within the physical device based on m_localSize.y() and m_localSize.z()
2644 		for (deInt32 localIdxY = 0; localIdxY < (m_splitWorkSize.y() / m_localSize.y()); localIdxY++)
2645 		{
2646 			for (deInt32 localIdxZ = 0; localIdxZ < (m_splitWorkSize.z() / m_localSize.z()); localIdxZ++)
2647 			{
2648 				deUint32 offsetX = baseGroupX;
2649 				deUint32 offsetY = baseGroupY + localIdxY * m_localSize.y();
2650 				deUint32 offsetZ = baseGroupZ + localIdxZ * m_localSize.z();
2651 
2652 				deUint32 localSizeX = (physDevIdx == (m_numPhysDevices - 1)) ? m_workSize.x() - baseGroupX : m_localSize.x();
2653 				deUint32 localSizeY = m_localSize.y();
2654 				deUint32 localSizeZ = m_localSize.z();
2655 
2656 				totalWorkloadSize += (localSizeX * localSizeY * localSizeZ);
2657 				vk.cmdDispatchBase(*cmdBuffer, offsetX, offsetY, offsetZ, localSizeX, localSizeY, localSizeZ);
2658 			}
2659 		}
2660 	}
2661 
2662 	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &shaderWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
2663 
2664 	endCommandBuffer(vk, *cmdBuffer);
2665 	submitCommandsAndWait(vk, device, queue, *cmdBuffer);
2666 
2667 	if (totalWorkloadSize != deUint32(multiplyComponents(m_workSize)))
2668 		TCU_THROW(TestError, "Not covering the entire workload.");
2669 
2670 	// Validate the results
2671 	const Allocation& bufferAllocation = buffer.getAllocation();
2672 	invalidateAlloc(vk, device, bufferAllocation);
2673 	const deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
2674 
2675 	for (deUint32 ndx = 0; ndx < m_numValues; ++ndx)
2676 	{
2677 		const deUint32 res = bufferPtr[ndx];
2678 		const deUint32 ref = ~inputData[ndx];
2679 
2680 		if (res != ref)
2681 		{
2682 			std::ostringstream msg;
2683 			msg << "Comparison failed for InOut.values[" << ndx << "]";
2684 			return tcu::TestStatus::fail(msg.str());
2685 		}
2686 	}
2687 	return tcu::TestStatus::pass("Compute succeeded");
2688 }
2689 
2690 class DeviceIndexTest : public vkt::TestCase
2691 {
2692 public:
2693 	DeviceIndexTest		(tcu::TestContext&	testCtx,
2694 											const std::string&	name,
2695 											const std::string&	description,
2696 											const deUint32		numValues,
2697 											const tcu::IVec3&	localsize,
2698 											const tcu::IVec3&	splitsize);
2699 
2700 	void				initPrograms		(SourceCollections& sourceCollections) const;
2701 	TestInstance*		createInstance		(Context&			context) const;
2702 
2703 private:
2704 	const deUint32					m_numValues;
2705 	const tcu::IVec3				m_localSize;
2706 	const tcu::IVec3				m_workSize;
2707 	const tcu::IVec3				m_splitSize;
2708 };
2709 
2710 class DeviceIndexTestInstance : public ComputeTestInstance
2711 {
2712 public:
2713 									DeviceIndexTestInstance	(Context&			context,
2714 																const deUint32		numValues,
2715 																const tcu::IVec3&	localsize,
2716 																const tcu::IVec3&	worksize);
2717 	tcu::TestStatus					iterate						(void);
2718 private:
2719 	const deUint32					m_numValues;
2720 	const tcu::IVec3				m_localSize;
2721 	tcu::IVec3						m_workSize;
2722 };
2723 
DeviceIndexTest(tcu::TestContext & testCtx,const std::string & name,const std::string & description,const deUint32 numValues,const tcu::IVec3 & localsize,const tcu::IVec3 & worksize)2724 DeviceIndexTest::DeviceIndexTest (tcu::TestContext&	testCtx,
2725 									const std::string&	name,
2726 									const std::string&	description,
2727 									const deUint32		numValues,
2728 									const tcu::IVec3&	localsize,
2729 									const tcu::IVec3&	worksize)
2730 	: TestCase		(testCtx, name, description)
2731 	, m_numValues	(numValues)
2732 	, m_localSize	(localsize)
2733 	, m_workSize	(worksize)
2734 {
2735 }
2736 
initPrograms(SourceCollections & sourceCollections) const2737 void DeviceIndexTest::initPrograms (SourceCollections& sourceCollections) const
2738 {
2739 	std::ostringstream src;
2740 	src << "#version 310 es\n"
2741 		<< "#extension GL_EXT_device_group : require\n"
2742 		<< "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
2743 
2744 		<< "layout(binding = 0) buffer InOut {\n"
2745 		<< "    uint values[" << de::toString(m_numValues) << "];\n"
2746 		<< "} sb_inout;\n"
2747 
2748 		<< "layout(binding = 1) readonly uniform uniformInput {\n"
2749 		<< "    uint baseOffset[1+" << VK_MAX_DEVICE_GROUP_SIZE_KHR << "];\n"
2750 		<< "} ubo_in;\n"
2751 
2752 		<< "void main (void) {\n"
2753 		<< "    uvec3 size = gl_NumWorkGroups * gl_WorkGroupSize;\n"
2754 		<< "    uint numValuesPerInv = uint(sb_inout.values.length()) / (size.x*size.y*size.z);\n"
2755 		<< "    uint index = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
2756 		<< "    uint offset = numValuesPerInv*index;\n"
2757 		<< "    for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
2758 		<< "        sb_inout.values[offset + ndx] = ubo_in.baseOffset[0] + ubo_in.baseOffset[gl_DeviceIndex + 1];\n"
2759 		<< "}\n";
2760 
2761 	sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
2762 }
2763 
createInstance(Context & context) const2764 TestInstance* DeviceIndexTest::createInstance (Context& context) const
2765 {
2766 	return new DeviceIndexTestInstance(context, m_numValues, m_localSize, m_workSize);
2767 }
2768 
DeviceIndexTestInstance(Context & context,const deUint32 numValues,const tcu::IVec3 & localsize,const tcu::IVec3 & worksize)2769 DeviceIndexTestInstance::DeviceIndexTestInstance (Context& context,
2770 													const deUint32		numValues,
2771 													const tcu::IVec3&	localsize,
2772 													const tcu::IVec3&	worksize)
2773 
2774 	: ComputeTestInstance	(context)
2775 	, m_numValues			(numValues)
2776 	, m_localSize			(localsize)
2777 	, m_workSize			(worksize)
2778 {}
2779 
iterate(void)2780 tcu::TestStatus DeviceIndexTestInstance::iterate (void)
2781 {
2782 	const DeviceInterface&			vk					= getDeviceInterface();
2783 	const VkDevice					device				= getDevice();
2784 	const VkQueue					queue				= getDeviceQueue(vk, device, m_queueFamilyIndex, 0);
2785 	SimpleAllocator					allocator			(vk, device, getPhysicalDeviceMemoryProperties(m_context.getInstanceInterface(), getPhysicalDevice()));
2786 	const deUint32					allocDeviceMask		= (1 << m_numPhysDevices) - 1;
2787 	de::Random						rnd					(0x82ce7f);
2788 	Move<VkBuffer>					sboBuffer;
2789 	vk::Move<vk::VkDeviceMemory>	sboBufferMemory;
2790 
2791 	// Create an uniform and output buffer
2792 	const deUint32 uniformBufSize = 4 * (1 + VK_MAX_DEVICE_GROUP_SIZE_KHR);
2793 	const VkDeviceSize uniformBufferSizeBytes = sizeof(deUint32) * uniformBufSize;
2794 	const Buffer uniformBuffer(vk, device, allocator, makeBufferCreateInfo(uniformBufferSizeBytes, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT), MemoryRequirement::HostVisible);
2795 
2796 	const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * m_numValues;
2797 	const Buffer checkBuffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_TRANSFER_DST_BIT), MemoryRequirement::HostVisible);
2798 
2799 	// create SBO buffer
2800 	{
2801 		const VkBufferCreateInfo	sboBufferParams =
2802 		{
2803 			VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,									// sType
2804 			DE_NULL,																// pNext
2805 			0u,																		// flags
2806 			(VkDeviceSize)bufferSizeBytes,											// size
2807 			VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT,	// usage
2808 			VK_SHARING_MODE_EXCLUSIVE,												// sharingMode
2809 			1u,																		// queueFamilyIndexCount
2810 			&m_queueFamilyIndex,														// pQueueFamilyIndices
2811 		};
2812 		sboBuffer = createBuffer(vk, device, &sboBufferParams);
2813 
2814 		VkMemoryRequirements memReqs = getBufferMemoryRequirements(vk, device, sboBuffer.get());
2815 		deUint32 memoryTypeNdx = 0;
2816 		const VkPhysicalDeviceMemoryProperties deviceMemProps = getPhysicalDeviceMemoryProperties(m_context.getInstanceInterface(), getPhysicalDevice());
2817 		for ( memoryTypeNdx = 0; memoryTypeNdx < deviceMemProps.memoryTypeCount; memoryTypeNdx++)
2818 		{
2819 			if ((memReqs.memoryTypeBits & (1u << memoryTypeNdx)) != 0 &&
2820 				(deviceMemProps.memoryTypes[memoryTypeNdx].propertyFlags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) == VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT)
2821 				break;
2822 		}
2823 		if (memoryTypeNdx == deviceMemProps.memoryTypeCount)
2824 			TCU_THROW(NotSupportedError, "No compatible memory type found");
2825 
2826 		const VkMemoryAllocateFlagsInfo allocDeviceMaskInfo =
2827 		{
2828 			VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO_KHR,	// sType
2829 			DE_NULL,											// pNext
2830 			VK_MEMORY_ALLOCATE_DEVICE_MASK_BIT,					// flags
2831 			allocDeviceMask,									// deviceMask
2832 		};
2833 
2834 		VkMemoryAllocateInfo		allocInfo =
2835 		{
2836 			VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,			// sType
2837 			&allocDeviceMaskInfo,							// pNext
2838 			memReqs.size,									// allocationSize
2839 			memoryTypeNdx,									// memoryTypeIndex
2840 		};
2841 
2842 		sboBufferMemory = allocateMemory(vk, device, &allocInfo);
2843 		VK_CHECK(vk.bindBufferMemory(device, *sboBuffer, sboBufferMemory.get(), 0));
2844 	}
2845 
2846 	// Fill the buffers with data
2847 	typedef std::vector<deUint32> data_vector_t;
2848 	data_vector_t uniformInputData(uniformBufSize, 0);
2849 
2850 	{
2851 		const Allocation& bufferAllocation = uniformBuffer.getAllocation();
2852 		deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
2853 		for (deUint32 i = 0; i < uniformBufSize; ++i)
2854 			uniformInputData[i] = *bufferPtr++ = rnd.getUint32() / 10; // divide to prevent overflow in addition
2855 
2856 		flushAlloc(vk, device, bufferAllocation);
2857 	}
2858 
2859 	// Create descriptor set
2860 	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
2861 		DescriptorSetLayoutBuilder()
2862 		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
2863 		.addSingleBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
2864 		.build(vk, device));
2865 
2866 	const Unique<VkDescriptorPool> descriptorPool(
2867 		DescriptorPoolBuilder()
2868 		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
2869 		.addType(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)
2870 		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
2871 
2872 	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
2873 
2874 	const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*sboBuffer, 0ull, bufferSizeBytes);
2875 	const VkDescriptorBufferInfo uniformBufferDescriptorInfo = makeDescriptorBufferInfo(*uniformBuffer, 0ull, uniformBufferSizeBytes);
2876 
2877 	DescriptorSetUpdateBuilder()
2878 		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
2879 		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, &uniformBufferDescriptorInfo)
2880 		.update(vk, device);
2881 
2882 	const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
2883 	const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
2884 	const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));
2885 
2886 	const VkBufferMemoryBarrier hostUniformWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_UNIFORM_READ_BIT, *uniformBuffer, 0ull, uniformBufferSizeBytes);
2887 	const VkBufferMemoryBarrier shaderWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT , *sboBuffer, 0ull, bufferSizeBytes);
2888 
2889 	const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, m_queueFamilyIndex));
2890 	const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
2891 
2892 	// Verify multiple device masks
2893 	for (deUint32 physDevMask = 1; physDevMask < (1u << m_numPhysDevices); physDevMask++)
2894 	{
2895 		deUint32 constantValPerLoop = 0;
2896 		{
2897 			const Allocation& bufferAllocation = uniformBuffer.getAllocation();
2898 			deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
2899 			constantValPerLoop = *bufferPtr = rnd.getUint32() / 10;  // divide to prevent overflow in addition
2900 			flushAlloc(vk, device, bufferAllocation);
2901 		}
2902 		beginCommandBuffer(vk, *cmdBuffer);
2903 
2904 		vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
2905 		vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
2906 		vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostUniformWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
2907 
2908 		vk.cmdSetDeviceMask(*cmdBuffer, physDevMask);
2909 		vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
2910 
2911 		vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &shaderWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
2912 
2913 		endCommandBuffer(vk, *cmdBuffer);
2914 		submitCommandsAndWait(vk, device, queue, *cmdBuffer, true, physDevMask);
2915 
2916 		// Validate the results on all physical devices where compute shader was launched
2917 		const VkBufferMemoryBarrier srcBufferBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT , *sboBuffer, 0ull, bufferSizeBytes);
2918 		const VkBufferMemoryBarrier dstBufferBarrier = makeBufferMemoryBarrier(VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *checkBuffer, 0ull, bufferSizeBytes);
2919 		const VkBufferCopy	copyParams =
2920 		{
2921 			(VkDeviceSize)0u,						// srcOffset
2922 			(VkDeviceSize)0u,						// dstOffset
2923 			bufferSizeBytes							// size
2924 		};
2925 
2926 		for (deUint32 physDevIdx = 0; physDevIdx < m_numPhysDevices; physDevIdx++)
2927 		{
2928 			if (!(1<<physDevIdx & physDevMask))
2929 				continue;
2930 
2931 			const deUint32 deviceMask = 1 << physDevIdx;
2932 
2933 			beginCommandBuffer(vk, *cmdBuffer);
2934 			vk.cmdSetDeviceMask(*cmdBuffer, deviceMask);
2935 			vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT , VK_PIPELINE_STAGE_TRANSFER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &srcBufferBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
2936 			vk.cmdCopyBuffer(*cmdBuffer, *sboBuffer, *checkBuffer, 1, &copyParams);
2937 			vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &dstBufferBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
2938 
2939 			endCommandBuffer(vk, *cmdBuffer);
2940 			submitCommandsAndWait(vk, device, queue, *cmdBuffer, true, deviceMask);
2941 
2942 			const Allocation& bufferAllocation = checkBuffer.getAllocation();
2943 			invalidateAlloc(vk, device, bufferAllocation);
2944 			const deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
2945 
2946 			for (deUint32 ndx = 0; ndx < m_numValues; ++ndx)
2947 			{
2948 				const deUint32 res = bufferPtr[ndx];
2949 				const deUint32 ref = constantValPerLoop + uniformInputData[4 * (physDevIdx + 1)];
2950 
2951 				if (res != ref)
2952 				{
2953 					std::ostringstream msg;
2954 					msg << "Comparison failed on physical device "<< getPhysicalDevice(physDevIdx) <<" ( deviceMask "<< deviceMask <<" ) for InOut.values[" << ndx << "]";
2955 					return tcu::TestStatus::fail(msg.str());
2956 				}
2957 			}
2958 		}
2959 	}
2960 
2961 	return tcu::TestStatus::pass("Compute succeeded");
2962 }
2963 
2964 class ConcurrentCompute : public vkt::TestCase
2965 {
2966 public:
2967 						ConcurrentCompute	(tcu::TestContext&	testCtx,
2968 											 const std::string&	name,
2969 											 const std::string&	description);
2970 
2971 
2972 	void				initPrograms		(SourceCollections& sourceCollections) const;
2973 	TestInstance*		createInstance		(Context&			context) const;
2974 };
2975 
2976 class ConcurrentComputeInstance : public vkt::TestInstance
2977 {
2978 public:
2979 									ConcurrentComputeInstance	(Context& context);
2980 
2981 	tcu::TestStatus					iterate						(void);
2982 };
2983 
ConcurrentCompute(tcu::TestContext & testCtx,const std::string & name,const std::string & description)2984 ConcurrentCompute::ConcurrentCompute (tcu::TestContext&	testCtx,
2985 									  const std::string&	name,
2986 									  const std::string&	description)
2987 	: TestCase		(testCtx, name, description)
2988 {
2989 }
2990 
initPrograms(SourceCollections & sourceCollections) const2991 void ConcurrentCompute::initPrograms (SourceCollections& sourceCollections) const
2992 {
2993 	std::ostringstream src;
2994 	src << "#version 310 es\n"
2995 		<< "layout (local_size_x = 1, local_size_y = 1, local_size_z = 1) in;\n"
2996 		<< "layout(binding = 0) buffer InOut {\n"
2997 		<< "    uint values[1024];\n"
2998 		<< "} sb_inout;\n"
2999 		<< "void main (void) {\n"
3000 		<< "    uvec3 size           = gl_NumWorkGroups * gl_WorkGroupSize;\n"
3001 		<< "    uint numValuesPerInv = uint(sb_inout.values.length()) / (size.x*size.y*size.z);\n"
3002 		<< "    uint groupNdx        = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
3003 		<< "    uint offset          = numValuesPerInv*groupNdx;\n"
3004 		<< "\n"
3005 		<< "    for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
3006 		<< "        sb_inout.values[offset + ndx] = ~sb_inout.values[offset + ndx];\n"
3007 		<< "}\n";
3008 
3009 	sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
3010 }
3011 
createInstance(Context & context) const3012 TestInstance* ConcurrentCompute::createInstance (Context& context) const
3013 {
3014 	return new ConcurrentComputeInstance(context);
3015 }
3016 
ConcurrentComputeInstance(Context & context)3017 ConcurrentComputeInstance::ConcurrentComputeInstance (Context& context)
3018 	: TestInstance	(context)
3019 {
3020 }
3021 
iterate(void)3022 tcu::TestStatus ConcurrentComputeInstance::iterate (void)
3023 {
3024 	enum {
3025 		NO_MATCH_FOUND	= ~((deUint32)0),
3026 		ERROR_NONE		= 0,
3027 		ERROR_WAIT		= 1,
3028 		ERROR_ORDER		= 2
3029 	};
3030 
3031 	struct Queues
3032 	{
3033 		VkQueue		queue;
3034 		deUint32	queueFamilyIndex;
3035 	};
3036 
3037 	const DeviceInterface&					vk							= m_context.getDeviceInterface();
3038 	const deUint32							numValues					= 1024;
3039 	const InstanceInterface&				instance					= m_context.getInstanceInterface();
3040 	const VkPhysicalDevice					physicalDevice				= m_context.getPhysicalDevice();
3041 	tcu::TestLog&							log							= m_context.getTestContext().getLog();
3042 	vk::Move<vk::VkDevice>					logicalDevice;
3043 	std::vector<VkQueueFamilyProperties>	queueFamilyProperties;
3044 	VkDeviceCreateInfo						deviceInfo;
3045 	VkPhysicalDeviceFeatures				deviceFeatures;
3046 	const float								queuePriorities[2]			= {1.0f, 0.0f};
3047 	VkDeviceQueueCreateInfo					queueInfos[2];
3048 	Queues									queues[2]					=
3049 																		{
3050 																			{DE_NULL, (deUint32)NO_MATCH_FOUND},
3051 																			{DE_NULL, (deUint32)NO_MATCH_FOUND}
3052 																		};
3053 
3054 	queueFamilyProperties = getPhysicalDeviceQueueFamilyProperties(instance, physicalDevice);
3055 
3056 	for (deUint32 queueNdx = 0; queueNdx < queueFamilyProperties.size(); ++queueNdx)
3057 	{
3058 		if (queueFamilyProperties[queueNdx].queueFlags & VK_QUEUE_COMPUTE_BIT)
3059 		{
3060 			if (NO_MATCH_FOUND == queues[0].queueFamilyIndex)
3061 				queues[0].queueFamilyIndex = queueNdx;
3062 
3063 			if (queues[0].queueFamilyIndex != queueNdx || queueFamilyProperties[queueNdx].queueCount > 1u)
3064 			{
3065 				queues[1].queueFamilyIndex = queueNdx;
3066 				break;
3067 			}
3068 		}
3069 	}
3070 
3071 	if (queues[0].queueFamilyIndex == NO_MATCH_FOUND || queues[1].queueFamilyIndex == NO_MATCH_FOUND)
3072 		TCU_THROW(NotSupportedError, "Queues couldn't be created");
3073 
3074 	for (int queueNdx = 0; queueNdx < 2; ++queueNdx)
3075 	{
3076 		VkDeviceQueueCreateInfo queueInfo;
3077 		deMemset(&queueInfo, 0, sizeof(queueInfo));
3078 
3079 		queueInfo.sType				= VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
3080 		queueInfo.pNext				= DE_NULL;
3081 		queueInfo.flags				= (VkDeviceQueueCreateFlags)0u;
3082 		queueInfo.queueFamilyIndex	= queues[queueNdx].queueFamilyIndex;
3083 		queueInfo.queueCount		= (queues[0].queueFamilyIndex == queues[1].queueFamilyIndex) ? 2 : 1;
3084 		queueInfo.pQueuePriorities	= (queueInfo.queueCount == 2) ? queuePriorities : &queuePriorities[queueNdx];
3085 
3086 		queueInfos[queueNdx]		= queueInfo;
3087 
3088 		if (queues[0].queueFamilyIndex == queues[1].queueFamilyIndex)
3089 			break;
3090 	}
3091 	deMemset(&deviceInfo, 0, sizeof(deviceInfo));
3092 	instance.getPhysicalDeviceFeatures(physicalDevice, &deviceFeatures);
3093 
3094 	deviceInfo.sType					= VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
3095 	deviceInfo.pNext					= DE_NULL;
3096 	deviceInfo.enabledExtensionCount	= 0u;
3097 	deviceInfo.ppEnabledExtensionNames	= DE_NULL;
3098 	deviceInfo.enabledLayerCount		= 0u;
3099 	deviceInfo.ppEnabledLayerNames		= DE_NULL;
3100 	deviceInfo.pEnabledFeatures			= &deviceFeatures;
3101 	deviceInfo.queueCreateInfoCount		= (queues[0].queueFamilyIndex == queues[1].queueFamilyIndex) ? 1 : 2;
3102 	deviceInfo.pQueueCreateInfos		= queueInfos;
3103 
3104 	logicalDevice = createCustomDevice(m_context.getTestContext().getCommandLine().isValidationEnabled(), m_context.getPlatformInterface(), m_context.getInstance(), instance, physicalDevice, &deviceInfo);
3105 
3106 	for (deUint32 queueReqNdx = 0; queueReqNdx < 2; ++queueReqNdx)
3107 	{
3108 		if (queues[0].queueFamilyIndex == queues[1].queueFamilyIndex)
3109 			vk.getDeviceQueue(*logicalDevice, queues[queueReqNdx].queueFamilyIndex, queueReqNdx, &queues[queueReqNdx].queue);
3110 		else
3111 			vk.getDeviceQueue(*logicalDevice, queues[queueReqNdx].queueFamilyIndex, 0u, &queues[queueReqNdx].queue);
3112 	}
3113 
3114 	// Create an input/output buffers
3115 	const VkPhysicalDeviceMemoryProperties memoryProperties	= vk::getPhysicalDeviceMemoryProperties(instance, physicalDevice);
3116 
3117 	SimpleAllocator *allocator								= new SimpleAllocator(vk, *logicalDevice, memoryProperties);
3118 	const VkDeviceSize bufferSizeBytes						= sizeof(deUint32) * numValues;
3119 	const Buffer buffer1(vk, *logicalDevice, *allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
3120 	const Buffer buffer2(vk, *logicalDevice, *allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
3121 
3122 	// Fill the buffers with data
3123 
3124 	typedef std::vector<deUint32> data_vector_t;
3125 	data_vector_t inputData(numValues);
3126 
3127 	{
3128 		de::Random rnd(0x82ce7f);
3129 		const Allocation& bufferAllocation1	= buffer1.getAllocation();
3130 		const Allocation& bufferAllocation2	= buffer2.getAllocation();
3131 		deUint32* bufferPtr1				= static_cast<deUint32*>(bufferAllocation1.getHostPtr());
3132 		deUint32* bufferPtr2				= static_cast<deUint32*>(bufferAllocation2.getHostPtr());
3133 
3134 		for (deUint32 i = 0; i < numValues; ++i)
3135 		{
3136 			deUint32 val = rnd.getUint32();
3137 			inputData[i] = val;
3138 			*bufferPtr1++ = val;
3139 			*bufferPtr2++ = val;
3140 		}
3141 
3142 		flushAlloc(vk, *logicalDevice, bufferAllocation1);
3143 		flushAlloc(vk, *logicalDevice, bufferAllocation2);
3144 	}
3145 
3146 	// Create descriptor sets
3147 
3148 	const Unique<VkDescriptorSetLayout>	descriptorSetLayout1(
3149 		DescriptorSetLayoutBuilder()
3150 		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
3151 		.build(vk, *logicalDevice));
3152 
3153 	const Unique<VkDescriptorPool>		descriptorPool1(
3154 		DescriptorPoolBuilder()
3155 		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
3156 		.build(vk, *logicalDevice, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
3157 
3158 	const Unique<VkDescriptorSet>		descriptorSet1(makeDescriptorSet(vk, *logicalDevice, *descriptorPool1, *descriptorSetLayout1));
3159 
3160 	const VkDescriptorBufferInfo		bufferDescriptorInfo1	= makeDescriptorBufferInfo(*buffer1, 0ull, bufferSizeBytes);
3161 		DescriptorSetUpdateBuilder()
3162 		.writeSingle(*descriptorSet1, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo1)
3163 		.update(vk, *logicalDevice);
3164 
3165 	const Unique<VkDescriptorSetLayout>	descriptorSetLayout2(
3166 		DescriptorSetLayoutBuilder()
3167 		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
3168 		.build(vk, *logicalDevice));
3169 
3170 	const Unique<VkDescriptorPool>		descriptorPool2(
3171 		DescriptorPoolBuilder()
3172 		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
3173 		.build(vk, *logicalDevice, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
3174 
3175 	const Unique<VkDescriptorSet>		descriptorSet2(makeDescriptorSet(vk, *logicalDevice, *descriptorPool2, *descriptorSetLayout2));
3176 
3177 	const VkDescriptorBufferInfo		bufferDescriptorInfo2	= makeDescriptorBufferInfo(*buffer2, 0ull, bufferSizeBytes);
3178 		DescriptorSetUpdateBuilder()
3179 		.writeSingle(*descriptorSet2, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo2)
3180 		.update(vk, *logicalDevice);
3181 
3182 	// Perform the computation
3183 
3184 	const Unique<VkShaderModule>		shaderModule(createShaderModule(vk, *logicalDevice, m_context.getBinaryCollection().get("comp"), 0u));
3185 
3186 	const Unique<VkPipelineLayout>		pipelineLayout1(makePipelineLayout(vk, *logicalDevice, *descriptorSetLayout1));
3187 	const Unique<VkPipeline>			pipeline1(makeComputePipeline(vk, *logicalDevice, *pipelineLayout1, *shaderModule));
3188 	const VkBufferMemoryBarrier			hostWriteBarrier1		= makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *buffer1, 0ull, bufferSizeBytes);
3189 	const VkBufferMemoryBarrier			shaderWriteBarrier1		= makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer1, 0ull, bufferSizeBytes);
3190 	const Unique<VkCommandPool>			cmdPool1(makeCommandPool(vk, *logicalDevice, queues[0].queueFamilyIndex));
3191 	const Unique<VkCommandBuffer>		cmdBuffer1(allocateCommandBuffer(vk, *logicalDevice, *cmdPool1, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
3192 
3193 	const Unique<VkPipelineLayout>		pipelineLayout2(makePipelineLayout(vk, *logicalDevice, *descriptorSetLayout2));
3194 	const Unique<VkPipeline>			pipeline2(makeComputePipeline(vk, *logicalDevice, *pipelineLayout2, *shaderModule));
3195 	const VkBufferMemoryBarrier			hostWriteBarrier2		= makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *buffer2, 0ull, bufferSizeBytes);
3196 	const VkBufferMemoryBarrier			shaderWriteBarrier2		= makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer2, 0ull, bufferSizeBytes);
3197 	const Unique<VkCommandPool>			cmdPool2(makeCommandPool(vk, *logicalDevice, queues[1].queueFamilyIndex));
3198 	const Unique<VkCommandBuffer>		cmdBuffer2(allocateCommandBuffer(vk, *logicalDevice, *cmdPool2, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
3199 
3200 	// Command buffer 1
3201 
3202 	beginCommandBuffer(vk, *cmdBuffer1);
3203 	vk.cmdBindPipeline(*cmdBuffer1, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline1);
3204 	vk.cmdBindDescriptorSets(*cmdBuffer1, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout1, 0u, 1u, &descriptorSet1.get(), 0u, DE_NULL);
3205 	vk.cmdPipelineBarrier(*cmdBuffer1, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostWriteBarrier1, 0, (const VkImageMemoryBarrier*)DE_NULL);
3206 	vk.cmdDispatch(*cmdBuffer1, 1, 1, 1);
3207 	vk.cmdPipelineBarrier(*cmdBuffer1, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &shaderWriteBarrier1, 0, (const VkImageMemoryBarrier*)DE_NULL);
3208 	endCommandBuffer(vk, *cmdBuffer1);
3209 
3210 	// Command buffer 2
3211 
3212 	beginCommandBuffer(vk, *cmdBuffer2);
3213 	vk.cmdBindPipeline(*cmdBuffer2, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline2);
3214 	vk.cmdBindDescriptorSets(*cmdBuffer2, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout2, 0u, 1u, &descriptorSet2.get(), 0u, DE_NULL);
3215 	vk.cmdPipelineBarrier(*cmdBuffer2, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostWriteBarrier2, 0, (const VkImageMemoryBarrier*)DE_NULL);
3216 	vk.cmdDispatch(*cmdBuffer2, 1, 1, 1);
3217 	vk.cmdPipelineBarrier(*cmdBuffer2, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &shaderWriteBarrier2, 0, (const VkImageMemoryBarrier*)DE_NULL);
3218 	endCommandBuffer(vk, *cmdBuffer2);
3219 
3220 	VkSubmitInfo	submitInfo1 =
3221 	{
3222 		VK_STRUCTURE_TYPE_SUBMIT_INFO,			// sType
3223 		DE_NULL,								// pNext
3224 		0u,										// waitSemaphoreCount
3225 		DE_NULL,								// pWaitSemaphores
3226 		(const VkPipelineStageFlags*)DE_NULL,	// pWaitDstStageMask
3227 		1u,										// commandBufferCount
3228 		&cmdBuffer1.get(),						// pCommandBuffers
3229 		0u,										// signalSemaphoreCount
3230 		DE_NULL									// pSignalSemaphores
3231 	};
3232 
3233 	VkSubmitInfo	submitInfo2 =
3234 	{
3235 		VK_STRUCTURE_TYPE_SUBMIT_INFO,			// sType
3236 		DE_NULL,								// pNext
3237 		0u,										// waitSemaphoreCount
3238 		DE_NULL,								// pWaitSemaphores
3239 		(const VkPipelineStageFlags*)DE_NULL,	// pWaitDstStageMask
3240 		1u,										// commandBufferCount
3241 		&cmdBuffer2.get(),						// pCommandBuffers
3242 		0u,										// signalSemaphoreCount
3243 		DE_NULL									// pSignalSemaphores
3244 	};
3245 
3246 	// Wait for completion
3247 	const Unique<VkFence>	fence1(createFence(vk, *logicalDevice));
3248 	const Unique<VkFence>	fence2(createFence(vk, *logicalDevice));
3249 
3250 	VK_CHECK(vk.queueSubmit(queues[0].queue, 1u, &submitInfo1, *fence1));
3251 	VK_CHECK(vk.queueSubmit(queues[1].queue, 1u, &submitInfo2, *fence2));
3252 
3253 	int err = ERROR_NONE;
3254 
3255 	// First wait for the low-priority queue
3256 	if (VK_SUCCESS != vk.waitForFences(*logicalDevice, 1u, &fence2.get(), DE_TRUE, ~0ull))
3257 		err = ERROR_WAIT;
3258 
3259 	// If the high-priority queue hasn't finished, we have a problem.
3260 	if (VK_SUCCESS != vk.getFenceStatus(*logicalDevice, fence1.get()))
3261 		if (err == ERROR_NONE)
3262 			err = ERROR_ORDER;
3263 
3264 	// Wait for the high-priority fence so we don't get errors on teardown.
3265 	vk.waitForFences(*logicalDevice, 1u, &fence1.get(), DE_TRUE, ~0ull);
3266 
3267 	// If we fail() before waiting for all of the fences, error will come from
3268 	// teardown instead of the error we want.
3269 
3270 	if (err == ERROR_WAIT)
3271 		return tcu::TestStatus::fail("Failed waiting for low-priority queue fence.");
3272 
3273 	// Validate the results
3274 
3275 	const Allocation& bufferAllocation1	= buffer1.getAllocation();
3276 	invalidateAlloc(vk, *logicalDevice, bufferAllocation1);
3277 	const deUint32* bufferPtr1			= static_cast<deUint32*>(bufferAllocation1.getHostPtr());
3278 
3279 	const Allocation& bufferAllocation2	= buffer2.getAllocation();
3280 	invalidateAlloc(vk, *logicalDevice, bufferAllocation2);
3281 	const deUint32* bufferPtr2			= static_cast<deUint32*>(bufferAllocation2.getHostPtr());
3282 
3283 	for (deUint32 ndx = 0; ndx < numValues; ++ndx)
3284 	{
3285 		const deUint32 res1	= bufferPtr1[ndx];
3286 		const deUint32 res2	= bufferPtr2[ndx];
3287 		const deUint32 inp	= inputData[ndx];
3288 		const deUint32 ref	= ~inp;
3289 
3290 		if (res1 != ref || res1 != res2)
3291 		{
3292 			std::ostringstream msg;
3293 			msg << "Comparison failed for InOut.values[" << ndx << "] ref:" << ref <<" res1:" << res1 << " res2:" << res2 << " inp:" << inp;
3294 			return tcu::TestStatus::fail(msg.str());
3295 		}
3296 	}
3297 
3298 	if (err == ERROR_ORDER)
3299 		log << tcu::TestLog::Message << "Note: Low-priority queue was faster than high-priority one. This is not an error, but priorities may be inverted." << tcu::TestLog::EndMessage;
3300 
3301 	return tcu::TestStatus::pass("Test passed");
3302 }
3303 
3304 class EmptyWorkGroupCase : public vkt::TestCase
3305 {
3306 public:
3307 					EmptyWorkGroupCase		(tcu::TestContext& testCtx, const std::string& name, const std::string& description, const tcu::UVec3& dispatchSize);
~EmptyWorkGroupCase(void)3308 	virtual			~EmptyWorkGroupCase		(void) {}
3309 
3310 	TestInstance*	createInstance			(Context& context) const override;
3311 	void			initPrograms			(vk::SourceCollections& programCollection) const override;
3312 
3313 protected:
3314 	const tcu::UVec3 m_dispatchSize;
3315 };
3316 
3317 class EmptyWorkGroupInstance : public vkt::TestInstance
3318 {
3319 public:
EmptyWorkGroupInstance(Context & context,const tcu::UVec3 & dispatchSize)3320 						EmptyWorkGroupInstance	(Context& context, const tcu::UVec3& dispatchSize)
3321 							: vkt::TestInstance	(context)
3322 							, m_dispatchSize	(dispatchSize)
3323 							{}
~EmptyWorkGroupInstance(void)3324 	virtual				~EmptyWorkGroupInstance	(void) {}
3325 
3326 	tcu::TestStatus		iterate					(void) override;
3327 
3328 protected:
3329 	const tcu::UVec3 m_dispatchSize;
3330 };
3331 
EmptyWorkGroupCase(tcu::TestContext & testCtx,const std::string & name,const std::string & description,const tcu::UVec3 & dispatchSize)3332 EmptyWorkGroupCase::EmptyWorkGroupCase (tcu::TestContext& testCtx, const std::string& name, const std::string& description, const tcu::UVec3& dispatchSize)
3333 	: vkt::TestCase		(testCtx, name, description)
3334 	, m_dispatchSize	(dispatchSize)
3335 {
3336 	DE_ASSERT(m_dispatchSize.x() == 0u || m_dispatchSize.y() == 0u || m_dispatchSize.z() == 0u);
3337 }
3338 
createInstance(Context & context) const3339 TestInstance* EmptyWorkGroupCase::createInstance (Context& context) const
3340 {
3341 	return new EmptyWorkGroupInstance(context, m_dispatchSize);
3342 }
3343 
initPrograms(vk::SourceCollections & programCollection) const3344 void EmptyWorkGroupCase::initPrograms (vk::SourceCollections& programCollection) const
3345 {
3346 	std::ostringstream comp;
3347 	comp
3348 		<< "#version 450\n"
3349 		<< "layout (local_size_x=1, local_size_y=1, local_size_z=1) in;\n"
3350 		<< "layout (set=0, binding=0) buffer VerificationBlock { uint value; } verif;\n"
3351 		<< "void main () { atomicAdd(verif.value, 1u); }\n"
3352 		;
3353 	programCollection.glslSources.add("comp") << glu::ComputeSource(comp.str());
3354 }
3355 
iterate(void)3356 tcu::TestStatus EmptyWorkGroupInstance::iterate (void)
3357 {
3358 	const auto&		vkd				= m_context.getDeviceInterface();
3359 	const auto		device			= m_context.getDevice();
3360 	auto&			alloc			= m_context.getDefaultAllocator();
3361 	const auto		queueIndex		= m_context.getUniversalQueueFamilyIndex();
3362 	const auto		queue			= m_context.getUniversalQueue();
3363 
3364 	const auto			verifBufferSize		= static_cast<VkDeviceSize>(sizeof(uint32_t));
3365 	const auto			verifBufferInfo		= makeBufferCreateInfo(verifBufferSize, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT);
3366 	BufferWithMemory	verifBuffer			(vkd, device, alloc, verifBufferInfo, MemoryRequirement::HostVisible);
3367 	auto&				verifBufferAlloc	= verifBuffer.getAllocation();
3368 	void*				verifBufferPtr		= verifBufferAlloc.getHostPtr();
3369 
3370 	deMemset(verifBufferPtr, 0, static_cast<size_t>(verifBufferSize));
3371 	flushAlloc(vkd, device, verifBufferAlloc);
3372 
3373 	DescriptorSetLayoutBuilder layoutBuilder;
3374 	layoutBuilder.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT);
3375 	const auto descriptorSetLayout = layoutBuilder.build(vkd, device);
3376 
3377 	const auto pipelineLayout	= makePipelineLayout(vkd, device, descriptorSetLayout.get());
3378 	const auto shaderModule		= createShaderModule(vkd, device, m_context.getBinaryCollection().get("comp"), 0u);
3379 	const auto pipeline			= makeComputePipeline(vkd, device, pipelineLayout.get(), shaderModule.get());
3380 
3381 	DescriptorPoolBuilder poolBuilder;
3382 	poolBuilder.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
3383 	const auto descriptorPool	= poolBuilder.build(vkd, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u);
3384 	const auto descriptorSet	= makeDescriptorSet(vkd, device, descriptorPool.get(), descriptorSetLayout.get());
3385 
3386 	DescriptorSetUpdateBuilder updateBuilder;
3387 	const auto verifBufferDescInfo = makeDescriptorBufferInfo(verifBuffer.get(), 0ull, verifBufferSize);
3388 	updateBuilder.writeSingle(descriptorSet.get(), DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &verifBufferDescInfo);
3389 	updateBuilder.update(vkd, device);
3390 
3391 	const auto cmdPool = makeCommandPool(vkd, device, queueIndex);
3392 	const auto cmdBufferPtr = allocateCommandBuffer(vkd, device, cmdPool.get(), VK_COMMAND_BUFFER_LEVEL_PRIMARY);
3393 	const auto cmdBuffer = cmdBufferPtr.get();
3394 
3395 	beginCommandBuffer(vkd, cmdBuffer);
3396 	vkd.cmdBindPipeline(cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline.get());
3397 	vkd.cmdBindDescriptorSets(cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipelineLayout.get(), 0u, 1u, &descriptorSet.get(), 0u, nullptr);
3398 	vkd.cmdDispatch(cmdBuffer, m_dispatchSize.x(), m_dispatchSize.y(), m_dispatchSize.z());
3399 
3400 	const auto readWriteAccess	= (VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_SHADER_READ_BIT);
3401 	const auto computeToCompute = makeMemoryBarrier(readWriteAccess, readWriteAccess);
3402 	vkd.cmdPipelineBarrier(cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0U, 1u, &computeToCompute, 0u, nullptr, 0u, nullptr);
3403 
3404 	vkd.cmdDispatch(cmdBuffer, 1u, 1u, 1u);
3405 
3406 	const auto computeToHost = makeMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT);
3407 	vkd.cmdPipelineBarrier(cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, 0u, 1u, &computeToHost, 0u, nullptr, 0u, nullptr);
3408 
3409 	endCommandBuffer(vkd, cmdBuffer);
3410 	submitCommandsAndWait(vkd, device, queue, cmdBuffer);
3411 
3412 	uint32_t value;
3413 	invalidateAlloc(vkd, device, verifBufferAlloc);
3414 	deMemcpy(&value, verifBufferPtr, sizeof(value));
3415 
3416 	if (value != 1u)
3417 	{
3418 		std::ostringstream msg;
3419 		msg << "Unexpected value found in buffer: " << value << " while expecting 1";
3420 		TCU_FAIL(msg.str());
3421 	}
3422 
3423 	return tcu::TestStatus::pass("Pass");
3424 }
3425 
3426 class MaxWorkGroupSizeTest : public vkt::TestCase
3427 {
3428 public:
3429 	enum class Axis	{ X = 0, Y = 1, Z = 2 };
3430 
3431 	struct Params
3432 	{
3433 		// Which axis to maximize.
3434 		Axis axis;
3435 	};
3436 
3437 							MaxWorkGroupSizeTest	(tcu::TestContext& testCtx, const std::string& name, const std::string& description, const Params& params);
~MaxWorkGroupSizeTest(void)3438 	virtual					~MaxWorkGroupSizeTest	(void) {}
3439 
3440 	virtual void			initPrograms			(vk::SourceCollections& programCollection) const;
3441 	virtual TestInstance*	createInstance			(Context& context) const;
3442 	virtual void			checkSupport			(Context& context) const;
3443 
3444 	// Helper to transform the axis value to an index.
3445 	static int				getIndex				(Axis axis);
3446 
3447 	// Helper returning the number of invocations according to the test parameters.
3448 	static deUint32			getInvocations			(const Params& params, const vk::InstanceInterface& vki, vk::VkPhysicalDevice physicalDevice, const vk::VkPhysicalDeviceProperties* devProperties = nullptr);
3449 
3450 	// Helper returning the buffer size needed to this test.
3451 	static deUint32			getSSBOSize				(deUint32 invocations);
3452 
3453 private:
3454 	Params m_params;
3455 };
3456 
3457 class MaxWorkGroupSizeInstance : public vkt::TestInstance
3458 {
3459 public:
3460 								MaxWorkGroupSizeInstance	(Context& context, const MaxWorkGroupSizeTest::Params& params);
~MaxWorkGroupSizeInstance(void)3461 	virtual						~MaxWorkGroupSizeInstance	(void) {}
3462 
3463 	virtual tcu::TestStatus		iterate			(void);
3464 
3465 private:
3466 	MaxWorkGroupSizeTest::Params m_params;
3467 };
3468 
getIndex(Axis axis)3469 int MaxWorkGroupSizeTest::getIndex (Axis axis)
3470 {
3471 	const int ret = static_cast<int>(axis);
3472 	DE_ASSERT(ret >= static_cast<int>(Axis::X) && ret <= static_cast<int>(Axis::Z));
3473 	return ret;
3474 }
3475 
getInvocations(const Params & params,const vk::InstanceInterface & vki,vk::VkPhysicalDevice physicalDevice,const vk::VkPhysicalDeviceProperties * devProperties)3476 deUint32 MaxWorkGroupSizeTest::getInvocations (const Params& params, const vk::InstanceInterface& vki, vk::VkPhysicalDevice physicalDevice, const vk::VkPhysicalDeviceProperties* devProperties)
3477 {
3478 	const auto axis = getIndex(params.axis);
3479 
3480 	if (devProperties)
3481 		return devProperties->limits.maxComputeWorkGroupSize[axis];
3482 	return vk::getPhysicalDeviceProperties(vki, physicalDevice).limits.maxComputeWorkGroupSize[axis];
3483 }
3484 
getSSBOSize(deUint32 invocations)3485 deUint32 MaxWorkGroupSizeTest::getSSBOSize (deUint32 invocations)
3486 {
3487 	return invocations * static_cast<deUint32>(sizeof(deUint32));
3488 }
3489 
MaxWorkGroupSizeTest(tcu::TestContext & testCtx,const std::string & name,const std::string & description,const Params & params)3490 MaxWorkGroupSizeTest::MaxWorkGroupSizeTest (tcu::TestContext& testCtx, const std::string& name, const std::string& description, const Params& params)
3491 	: vkt::TestCase	(testCtx, name, description)
3492 	, m_params		(params)
3493 {}
3494 
initPrograms(vk::SourceCollections & programCollection) const3495 void MaxWorkGroupSizeTest::initPrograms (vk::SourceCollections& programCollection) const
3496 {
3497 	std::ostringstream shader;
3498 
3499 	// The actual local sizes will be set using spec constants when running the test instance.
3500 	shader
3501 		<< "#version 450\n"
3502 		<< "\n"
3503 		<< "layout(constant_id=0) const int local_size_x_val = 1;\n"
3504 		<< "layout(constant_id=1) const int local_size_y_val = 1;\n"
3505 		<< "layout(constant_id=2) const int local_size_z_val = 1;\n"
3506 		<< "\n"
3507 		<< "layout(local_size_x_id=0, local_size_y_id=1, local_size_z_id=2) in;\n"
3508 		<< "\n"
3509 		<< "layout(set=0, binding=0) buffer StorageBuffer {\n"
3510 		<< "    uint values[];\n"
3511 		<< "} ssbo;\n"
3512 		<< "\n"
3513 		<< "void main() {\n"
3514 		<< "    ssbo.values[gl_LocalInvocationIndex] = 1u;\n"
3515 		<< "}\n"
3516 		;
3517 
3518 	programCollection.glslSources.add("comp") << glu::ComputeSource(shader.str());
3519 }
3520 
createInstance(Context & context) const3521 TestInstance* MaxWorkGroupSizeTest::createInstance (Context& context) const
3522 {
3523 	return new MaxWorkGroupSizeInstance(context, m_params);
3524 }
3525 
checkSupport(Context & context) const3526 void MaxWorkGroupSizeTest::checkSupport (Context& context) const
3527 {
3528 	const auto&	vki				= context.getInstanceInterface();
3529 	const auto	physicalDevice	= context.getPhysicalDevice();
3530 
3531 	const auto	properties		= vk::getPhysicalDeviceProperties(vki, physicalDevice);
3532 	const auto	invocations		= getInvocations(m_params, vki, physicalDevice, &properties);
3533 
3534 	if (invocations > properties.limits.maxComputeWorkGroupInvocations)
3535 		TCU_FAIL("Reported workgroup size limit in the axis is greater than the global invocation limit");
3536 
3537 	if (properties.limits.maxStorageBufferRange / static_cast<deUint32>(sizeof(deUint32)) < invocations)
3538 		TCU_THROW(NotSupportedError, "Maximum supported storage buffer range too small");
3539 }
3540 
MaxWorkGroupSizeInstance(Context & context,const MaxWorkGroupSizeTest::Params & params)3541 MaxWorkGroupSizeInstance::MaxWorkGroupSizeInstance (Context& context, const MaxWorkGroupSizeTest::Params& params)
3542 	: vkt::TestInstance	(context)
3543 	, m_params			(params)
3544 {}
3545 
iterate(void)3546 tcu::TestStatus MaxWorkGroupSizeInstance::iterate (void)
3547 {
3548 	const auto&	vki				= m_context.getInstanceInterface();
3549 	const auto&	vkd				= m_context.getDeviceInterface();
3550 	const auto	physicalDevice	= m_context.getPhysicalDevice();
3551 	const auto	device			= m_context.getDevice();
3552 	auto&		alloc			= m_context.getDefaultAllocator();
3553 	const auto	queueIndex		= m_context.getUniversalQueueFamilyIndex();
3554 	const auto	queue			= m_context.getUniversalQueue();
3555 	auto&		log				= m_context.getTestContext().getLog();
3556 
3557 	const auto	axis			= MaxWorkGroupSizeTest::getIndex(m_params.axis);
3558 	const auto	invocations		= MaxWorkGroupSizeTest::getInvocations(m_params, vki, physicalDevice);
3559 	const auto	ssboSize		= static_cast<vk::VkDeviceSize>(MaxWorkGroupSizeTest::getSSBOSize(invocations));
3560 
3561 	log
3562 		<< tcu::TestLog::Message
3563 		<< "Running test with " << invocations << " invocations on axis " << axis << " using a storage buffer size of " << ssboSize << " bytes"
3564 		<< tcu::TestLog::EndMessage
3565 		;
3566 
3567 	// Main SSBO buffer.
3568 	const auto				ssboInfo	= vk::makeBufferCreateInfo(ssboSize, vk::VK_BUFFER_USAGE_STORAGE_BUFFER_BIT);
3569 	vk::BufferWithMemory	ssbo		(vkd, device, alloc, ssboInfo, vk::MemoryRequirement::HostVisible);
3570 
3571 	// Shader module.
3572 	const auto shaderModule	= vk::createShaderModule(vkd, device, m_context.getBinaryCollection().get("comp"), 0u);
3573 
3574 	// Descriptor set layouts.
3575 	vk::DescriptorSetLayoutBuilder layoutBuilder;
3576 	layoutBuilder.addSingleBinding(vk::VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, vk::VK_SHADER_STAGE_COMPUTE_BIT);
3577 	const auto descriptorSetLayout = layoutBuilder.build(vkd, device);
3578 
3579 	// Specialization constants: set the number of invocations in the appropriate local size id.
3580 	const auto	entrySize				= static_cast<deUintptr>(sizeof(deInt32));
3581 	deInt32		specializationData[3]	= { 1, 1, 1 };
3582 	specializationData[axis] = static_cast<deInt32>(invocations);
3583 
3584 	const vk::VkSpecializationMapEntry specializationMaps[3] =
3585 	{
3586 		{
3587 			0u,										//	deUint32	constantID;
3588 			0u,										//	deUint32	offset;
3589 			entrySize,								//	deUintptr	size;
3590 		},
3591 		{
3592 			1u,										//	deUint32	constantID;
3593 			static_cast<deUint32>(entrySize),		//	deUint32	offset;
3594 			entrySize,								//	deUintptr	size;
3595 		},
3596 		{
3597 			2u,										//	deUint32	constantID;
3598 			static_cast<deUint32>(entrySize * 2u),	//	deUint32	offset;
3599 			entrySize,								//	deUintptr	size;
3600 		},
3601 	};
3602 
3603 	const vk::VkSpecializationInfo specializationInfo =
3604 	{
3605 		3u,													//	deUint32						mapEntryCount;
3606 		specializationMaps,									//	const VkSpecializationMapEntry*	pMapEntries;
3607 		static_cast<deUintptr>(sizeof(specializationData)),	//	deUintptr						dataSize;
3608 		specializationData,									//	const void*						pData;
3609 	};
3610 
3611 	// Test pipeline.
3612 	const vk::VkPipelineLayoutCreateInfo testPipelineLayoutInfo =
3613 	{
3614 		vk::VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,	//	VkStructureType					sType;
3615 		nullptr,											//	const void*						pNext;
3616 		0u,													//	VkPipelineLayoutCreateFlags		flags;
3617 		1u,													//	deUint32						setLayoutCount;
3618 		&descriptorSetLayout.get(),							//	const VkDescriptorSetLayout*	pSetLayouts;
3619 		0u,													//	deUint32						pushConstantRangeCount;
3620 		nullptr,											//	const VkPushConstantRange*		pPushConstantRanges;
3621 	};
3622 	const auto testPipelineLayout = vk::createPipelineLayout(vkd, device, &testPipelineLayoutInfo);
3623 
3624 	const vk::VkComputePipelineCreateInfo testPipelineInfo =
3625 	{
3626 		vk::VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,	//	VkStructureType					sType;
3627 		nullptr,											//	const void*						pNext;
3628 		0u,													//	VkPipelineCreateFlags			flags;
3629 		{													//	VkPipelineShaderStageCreateInfo	stage;
3630 			vk::VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,//	VkStructureType						sType;
3631 			nullptr,												//	const void*							pNext;
3632 			0u,														//	VkPipelineShaderStageCreateFlags	flags;
3633 			vk::VK_SHADER_STAGE_COMPUTE_BIT,						//	VkShaderStageFlagBits				stage;
3634 			shaderModule.get(),										//	VkShaderModule						module;
3635 			"main",													//	const char*							pName;
3636 			&specializationInfo,									//	const VkSpecializationInfo*			pSpecializationInfo;
3637 		},
3638 		testPipelineLayout.get(),							//	VkPipelineLayout				layout;
3639 		DE_NULL,											//	VkPipeline						basePipelineHandle;
3640 		0u,													//	deInt32							basePipelineIndex;
3641 	};
3642 	const auto testPipeline = vk::createComputePipeline(vkd, device, DE_NULL, &testPipelineInfo);
3643 
3644 	// Create descriptor pool and set.
3645 	vk::DescriptorPoolBuilder poolBuilder;
3646 	poolBuilder.addType(vk::VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
3647 	const auto descriptorPool	= poolBuilder.build(vkd, device, vk::VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u);
3648 	const auto descriptorSet	= vk::makeDescriptorSet(vkd, device, descriptorPool.get(), descriptorSetLayout.get());
3649 
3650 	// Update descriptor set.
3651 	const vk::VkDescriptorBufferInfo ssboBufferInfo =
3652 	{
3653 		ssbo.get(),		//	VkBuffer		buffer;
3654 		0u,				//	VkDeviceSize	offset;
3655 		VK_WHOLE_SIZE,	//	VkDeviceSize	range;
3656 	};
3657 
3658 	vk::DescriptorSetUpdateBuilder updateBuilder;
3659 	updateBuilder.writeSingle(descriptorSet.get(), vk::DescriptorSetUpdateBuilder::Location::binding(0u), vk::VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &ssboBufferInfo);
3660 	updateBuilder.update(vkd, device);
3661 
3662 	// Clear buffer.
3663 	auto& ssboAlloc	= ssbo.getAllocation();
3664 	void* ssboPtr	= ssboAlloc.getHostPtr();
3665 	deMemset(ssboPtr, 0, static_cast<size_t>(ssboSize));
3666 	vk::flushAlloc(vkd, device, ssboAlloc);
3667 
3668 	// Run pipelines.
3669 	const auto cmdPool		= vk::makeCommandPool(vkd, device, queueIndex);
3670 	const auto cmdBUfferPtr	= vk::allocateCommandBuffer(vkd, device, cmdPool.get(), vk::VK_COMMAND_BUFFER_LEVEL_PRIMARY);
3671 	const auto cmdBuffer	= cmdBUfferPtr.get();
3672 
3673 	vk::beginCommandBuffer(vkd, cmdBuffer);
3674 
3675 	// Run the main test shader.
3676 	const auto hostToComputeBarrier = vk::makeBufferMemoryBarrier(vk::VK_ACCESS_HOST_WRITE_BIT, vk::VK_ACCESS_SHADER_WRITE_BIT, ssbo.get(), 0ull, VK_WHOLE_SIZE);
3677 	vkd.cmdPipelineBarrier(cmdBuffer, vk::VK_PIPELINE_STAGE_HOST_BIT, vk::VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0u, 0u, nullptr, 1u, &hostToComputeBarrier, 0u, nullptr);
3678 
3679 	vkd.cmdBindPipeline(cmdBuffer, vk::VK_PIPELINE_BIND_POINT_COMPUTE, testPipeline.get());
3680 	vkd.cmdBindDescriptorSets(cmdBuffer, vk::VK_PIPELINE_BIND_POINT_COMPUTE, testPipelineLayout.get(), 0u, 1u, &descriptorSet.get(), 0u, nullptr);
3681 	vkd.cmdDispatch(cmdBuffer, 1u, 1u, 1u);
3682 
3683 	const auto computeToHostBarrier = vk::makeBufferMemoryBarrier(vk::VK_ACCESS_SHADER_WRITE_BIT, vk::VK_ACCESS_HOST_READ_BIT, ssbo.get(), 0ull, VK_WHOLE_SIZE);
3684 	vkd.cmdPipelineBarrier(cmdBuffer, vk::VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, vk::VK_PIPELINE_STAGE_HOST_BIT, 0u, 0u, nullptr, 1u, &computeToHostBarrier, 0u, nullptr);
3685 
3686 	vk::endCommandBuffer(vkd, cmdBuffer);
3687 	vk::submitCommandsAndWait(vkd, device, queue, cmdBuffer);
3688 
3689 	// Verify buffer contents.
3690 	vk::invalidateAlloc(vkd, device, ssboAlloc);
3691 	std::unique_ptr<deUint32[]>	valuesArray	(new deUint32[invocations]);
3692 	deUint32*					valuesPtr	= valuesArray.get();
3693 	deMemcpy(valuesPtr, ssboPtr, static_cast<size_t>(ssboSize));
3694 
3695 	std::string	errorMsg;
3696 	bool		ok			= true;
3697 
3698 	for (size_t i = 0; i < invocations; ++i)
3699 	{
3700 		if (valuesPtr[i] != 1u)
3701 		{
3702 			ok			= false;
3703 			errorMsg	= "Found invalid value for invocation index " + de::toString(i) + ": expected 1u and found " + de::toString(valuesPtr[i]);
3704 			break;
3705 		}
3706 	}
3707 
3708 	if (!ok)
3709 		return tcu::TestStatus::fail(errorMsg);
3710 	return tcu::TestStatus::pass("Pass");
3711 }
3712 
3713 namespace EmptyShaderTest
3714 {
3715 
createProgram(SourceCollections & dst)3716 void createProgram (SourceCollections& dst)
3717 {
3718 	dst.glslSources.add("comp") << glu::ComputeSource(
3719 		"#version 310 es\n"
3720 		"layout (local_size_x = 1) in;\n"
3721 		"void main (void) {}\n"
3722 	);
3723 }
3724 
createTest(Context & context)3725 tcu::TestStatus createTest (Context& context)
3726 {
3727 	const DeviceInterface&	vk					= context.getDeviceInterface();
3728 	const VkDevice			device				= context.getDevice();
3729 	const VkQueue			queue				= context.getUniversalQueue();
3730 	const deUint32			queueFamilyIndex	= context.getUniversalQueueFamilyIndex();
3731 
3732 	const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, context.getBinaryCollection().get("comp"), 0u));
3733 
3734 	const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device));
3735 	const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));
3736 
3737 	const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
3738 	const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
3739 
3740 	// Start recording commands
3741 
3742 	beginCommandBuffer(vk, *cmdBuffer);
3743 
3744 	vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
3745 
3746 	const tcu::IVec3 workGroups(1, 1, 1);
3747 	vk.cmdDispatch(*cmdBuffer, workGroups.x(), workGroups.y(), workGroups.z());
3748 
3749 	endCommandBuffer(vk, *cmdBuffer);
3750 
3751 	submitCommandsAndWait(vk, device, queue, *cmdBuffer);
3752 
3753 	return tcu::TestStatus::pass("Compute succeeded");
3754 }
3755 
3756 } // EmptyShaderTest ns
3757 } // anonymous
3758 
createBasicComputeShaderTests(tcu::TestContext & testCtx)3759 tcu::TestCaseGroup* createBasicComputeShaderTests (tcu::TestContext& testCtx)
3760 {
3761 	de::MovePtr<tcu::TestCaseGroup> basicComputeTests(new tcu::TestCaseGroup(testCtx, "basic", "Basic compute tests"));
3762 
3763 	addFunctionCaseWithPrograms(basicComputeTests.get(), "empty_shader", "Shader that does nothing", EmptyShaderTest::createProgram, EmptyShaderTest::createTest);
3764 
3765 	basicComputeTests->addChild(new ConcurrentCompute(testCtx, "concurrent_compute", "Concurrent compute test"));
3766 
3767 	basicComputeTests->addChild(new EmptyWorkGroupCase(testCtx, "empty_workgroup_x", "Use an empty workgroup with size 0 on the X axis", tcu::UVec3(0u, 2u, 3u)));
3768 	basicComputeTests->addChild(new EmptyWorkGroupCase(testCtx, "empty_workgroup_y", "Use an empty workgroup with size 0 on the Y axis", tcu::UVec3(2u, 0u, 3u)));
3769 	basicComputeTests->addChild(new EmptyWorkGroupCase(testCtx, "empty_workgroup_z", "Use an empty workgroup with size 0 on the Z axis", tcu::UVec3(2u, 3u, 0u)));
3770 	basicComputeTests->addChild(new EmptyWorkGroupCase(testCtx, "empty_workgroup_all", "Use an empty workgroup with size 0 on the X, Y and Z axes", tcu::UVec3(0u, 0u, 0u)));
3771 
3772 	basicComputeTests->addChild(new MaxWorkGroupSizeTest(testCtx, "max_local_size_x", "Use the maximum work group size on the X axis", MaxWorkGroupSizeTest::Params{MaxWorkGroupSizeTest::Axis::X}));
3773 	basicComputeTests->addChild(new MaxWorkGroupSizeTest(testCtx, "max_local_size_y", "Use the maximum work group size on the Y axis", MaxWorkGroupSizeTest::Params{MaxWorkGroupSizeTest::Axis::Y}));
3774 	basicComputeTests->addChild(new MaxWorkGroupSizeTest(testCtx, "max_local_size_z", "Use the maximum work group size on the Z axis", MaxWorkGroupSizeTest::Params{MaxWorkGroupSizeTest::Axis::Z}));
3775 
3776 	basicComputeTests->addChild(BufferToBufferInvertTest::UBOToSSBOInvertCase(testCtx,	"ubo_to_ssbo_single_invocation",	"Copy from UBO to SSBO, inverting bits",	256,	tcu::IVec3(1,1,1),	tcu::IVec3(1,1,1)));
3777 	basicComputeTests->addChild(BufferToBufferInvertTest::UBOToSSBOInvertCase(testCtx,	"ubo_to_ssbo_single_group",			"Copy from UBO to SSBO, inverting bits",	1024,	tcu::IVec3(2,1,4),	tcu::IVec3(1,1,1)));
3778 	basicComputeTests->addChild(BufferToBufferInvertTest::UBOToSSBOInvertCase(testCtx,	"ubo_to_ssbo_multiple_invocations",	"Copy from UBO to SSBO, inverting bits",	1024,	tcu::IVec3(1,1,1),	tcu::IVec3(2,4,1)));
3779 	basicComputeTests->addChild(BufferToBufferInvertTest::UBOToSSBOInvertCase(testCtx,	"ubo_to_ssbo_multiple_groups",		"Copy from UBO to SSBO, inverting bits",	1024,	tcu::IVec3(1,4,2),	tcu::IVec3(2,2,4)));
3780 
3781 	basicComputeTests->addChild(BufferToBufferInvertTest::CopyInvertSSBOCase(testCtx,	"copy_ssbo_single_invocation",		"Copy between SSBOs, inverting bits",	256,	tcu::IVec3(1,1,1),	tcu::IVec3(1,1,1)));
3782 	basicComputeTests->addChild(BufferToBufferInvertTest::CopyInvertSSBOCase(testCtx,	"copy_ssbo_multiple_invocations",	"Copy between SSBOs, inverting bits",	1024,	tcu::IVec3(1,1,1),	tcu::IVec3(2,4,1)));
3783 	basicComputeTests->addChild(BufferToBufferInvertTest::CopyInvertSSBOCase(testCtx,	"copy_ssbo_multiple_groups",		"Copy between SSBOs, inverting bits",	1024,	tcu::IVec3(1,4,2),	tcu::IVec3(2,2,4)));
3784 
3785 	basicComputeTests->addChild(new InvertSSBOInPlaceTest(testCtx,	"ssbo_rw_single_invocation",			"Read and write same SSBO",		256,	true,	tcu::IVec3(1,1,1),	tcu::IVec3(1,1,1)));
3786 	basicComputeTests->addChild(new InvertSSBOInPlaceTest(testCtx,	"ssbo_rw_multiple_groups",				"Read and write same SSBO",		1024,	true,	tcu::IVec3(1,4,2),	tcu::IVec3(2,2,4)));
3787 	basicComputeTests->addChild(new InvertSSBOInPlaceTest(testCtx,	"ssbo_unsized_arr_single_invocation",	"Read and write same SSBO",		256,	false,	tcu::IVec3(1,1,1),	tcu::IVec3(1,1,1)));
3788 	basicComputeTests->addChild(new InvertSSBOInPlaceTest(testCtx,	"ssbo_unsized_arr_multiple_groups",		"Read and write same SSBO",		1024,	false,	tcu::IVec3(1,4,2),	tcu::IVec3(2,2,4)));
3789 
3790 	basicComputeTests->addChild(new WriteToMultipleSSBOTest(testCtx,	"write_multiple_arr_single_invocation",			"Write to multiple SSBOs",	256,	true,	tcu::IVec3(1,1,1),	tcu::IVec3(1,1,1)));
3791 	basicComputeTests->addChild(new WriteToMultipleSSBOTest(testCtx,	"write_multiple_arr_multiple_groups",			"Write to multiple SSBOs",	1024,	true,	tcu::IVec3(1,4,2),	tcu::IVec3(2,2,4)));
3792 	basicComputeTests->addChild(new WriteToMultipleSSBOTest(testCtx,	"write_multiple_unsized_arr_single_invocation",	"Write to multiple SSBOs",	256,	false,	tcu::IVec3(1,1,1),	tcu::IVec3(1,1,1)));
3793 	basicComputeTests->addChild(new WriteToMultipleSSBOTest(testCtx,	"write_multiple_unsized_arr_multiple_groups",	"Write to multiple SSBOs",	1024,	false,	tcu::IVec3(1,4,2),	tcu::IVec3(2,2,4)));
3794 
3795 	basicComputeTests->addChild(new SSBOLocalBarrierTest(testCtx,	"ssbo_local_barrier_single_invocation",	"SSBO local barrier usage",	tcu::IVec3(1,1,1),	tcu::IVec3(1,1,1)));
3796 	basicComputeTests->addChild(new SSBOLocalBarrierTest(testCtx,	"ssbo_local_barrier_single_group",		"SSBO local barrier usage",	tcu::IVec3(3,2,5),	tcu::IVec3(1,1,1)));
3797 	basicComputeTests->addChild(new SSBOLocalBarrierTest(testCtx,	"ssbo_local_barrier_multiple_groups",	"SSBO local barrier usage",	tcu::IVec3(3,4,1),	tcu::IVec3(2,7,3)));
3798 
3799 	basicComputeTests->addChild(new SSBOBarrierTest(testCtx,	"ssbo_cmd_barrier_single",		"SSBO memory barrier usage",	tcu::IVec3(1,1,1)));
3800 	basicComputeTests->addChild(new SSBOBarrierTest(testCtx,	"ssbo_cmd_barrier_multiple",	"SSBO memory barrier usage",	tcu::IVec3(11,5,7)));
3801 
3802 	basicComputeTests->addChild(new SharedVarTest(testCtx,	"shared_var_single_invocation",		"Basic shared variable usage",	tcu::IVec3(1,1,1),	tcu::IVec3(1,1,1)));
3803 	basicComputeTests->addChild(new SharedVarTest(testCtx,	"shared_var_single_group",			"Basic shared variable usage",	tcu::IVec3(3,2,5),	tcu::IVec3(1,1,1)));
3804 	basicComputeTests->addChild(new SharedVarTest(testCtx,	"shared_var_multiple_invocations",	"Basic shared variable usage",	tcu::IVec3(1,1,1),	tcu::IVec3(2,5,4)));
3805 	basicComputeTests->addChild(new SharedVarTest(testCtx,	"shared_var_multiple_groups",		"Basic shared variable usage",	tcu::IVec3(3,4,1),	tcu::IVec3(2,7,3)));
3806 
3807 	basicComputeTests->addChild(new SharedVarAtomicOpTest(testCtx,	"shared_atomic_op_single_invocation",		"Atomic operation with shared var",		tcu::IVec3(1,1,1),	tcu::IVec3(1,1,1)));
3808 	basicComputeTests->addChild(new SharedVarAtomicOpTest(testCtx,	"shared_atomic_op_single_group",			"Atomic operation with shared var",		tcu::IVec3(3,2,5),	tcu::IVec3(1,1,1)));
3809 	basicComputeTests->addChild(new SharedVarAtomicOpTest(testCtx,	"shared_atomic_op_multiple_invocations",	"Atomic operation with shared var",		tcu::IVec3(1,1,1),	tcu::IVec3(2,5,4)));
3810 	basicComputeTests->addChild(new SharedVarAtomicOpTest(testCtx,	"shared_atomic_op_multiple_groups",			"Atomic operation with shared var",		tcu::IVec3(3,4,1),	tcu::IVec3(2,7,3)));
3811 
3812 	basicComputeTests->addChild(new CopyImageToSSBOTest(testCtx,	"copy_image_to_ssbo_small",	"Image to SSBO copy",	tcu::IVec2(1,1),	tcu::IVec2(64,64)));
3813 	basicComputeTests->addChild(new CopyImageToSSBOTest(testCtx,	"copy_image_to_ssbo_large",	"Image to SSBO copy",	tcu::IVec2(2,4),	tcu::IVec2(512,512)));
3814 
3815 	basicComputeTests->addChild(new CopySSBOToImageTest(testCtx,	"copy_ssbo_to_image_small",	"SSBO to image copy",	tcu::IVec2(1, 1),	tcu::IVec2(64, 64)));
3816 	basicComputeTests->addChild(new CopySSBOToImageTest(testCtx,	"copy_ssbo_to_image_large",	"SSBO to image copy",	tcu::IVec2(2, 4),	tcu::IVec2(512, 512)));
3817 
3818 	basicComputeTests->addChild(new ImageAtomicOpTest(testCtx,	"image_atomic_op_local_size_1",	"Atomic operation with image",	1,	tcu::IVec2(64,64)));
3819 	basicComputeTests->addChild(new ImageAtomicOpTest(testCtx,	"image_atomic_op_local_size_8",	"Atomic operation with image",	8,	tcu::IVec2(64,64)));
3820 
3821 	basicComputeTests->addChild(new ImageBarrierTest(testCtx,	"image_barrier_single",		"Image barrier",	tcu::IVec2(1,1)));
3822 	basicComputeTests->addChild(new ImageBarrierTest(testCtx,	"image_barrier_multiple",	"Image barrier",	tcu::IVec2(64,64)));
3823 
3824 	basicComputeTests->addChild(cts_amber::createAmberTestCase(testCtx, "write_ssbo_array", "", "compute", "write_ssbo_array.amber"));
3825 
3826 	return basicComputeTests.release();
3827 }
3828 
createBasicDeviceGroupComputeShaderTests(tcu::TestContext & testCtx)3829 tcu::TestCaseGroup* createBasicDeviceGroupComputeShaderTests (tcu::TestContext& testCtx)
3830 {
3831 	de::MovePtr<tcu::TestCaseGroup> deviceGroupComputeTests(new tcu::TestCaseGroup(testCtx, "device_group", "Basic device group compute tests"));
3832 
3833 	deviceGroupComputeTests->addChild(new DispatchBaseTest(testCtx,	"dispatch_base",	"Compute shader with base groups",				32768,	tcu::IVec3(4,2,4),	tcu::IVec3(16,8,8),	tcu::IVec3(4,8,8)));
3834 	deviceGroupComputeTests->addChild(new DeviceIndexTest(testCtx,	"device_index",		"Compute shader using deviceIndex in SPIRV",	96,		tcu::IVec3(3,2,1),	tcu::IVec3(2,4,1)));
3835 
3836 	return deviceGroupComputeTests.release();
3837 
3838 }
3839 } // compute
3840 } // vkt
3841