• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*------------------------------------------------------------------------
2  * Vulkan Conformance Tests
3  * ------------------------
4  *
5  * Copyright (c) 2019 The Khronos Group Inc.
6  * Copyright (c) 2019 The Android Open Source Project
7  * Copyright (c) 2023 LunarG, Inc.
8  * Copyright (c) 2023 Nintendo
9  *
10  * Licensed under the Apache License, Version 2.0 (the "License");
11  * you may not use this file except in compliance with the License.
12  * You may obtain a copy of the License at
13  *
14  *      http://www.apache.org/licenses/LICENSE-2.0
15  *
16  * Unless required by applicable law or agreed to in writing, software
17  * distributed under the License is distributed on an "AS IS" BASIS,
18  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19  * See the License for the specific language governing permissions and
20  * limitations under the License.
21  *
22  *//*!
23  * \file
24  * \brief Compute Shader Tests
25  *//*--------------------------------------------------------------------*/
26 
27 #include "vktComputeBasicComputeShaderTests.hpp"
28 #include "vktTestCase.hpp"
29 #include "vktTestCaseUtil.hpp"
30 #include "vktComputeTestsUtil.hpp"
31 #include "vktCustomInstancesDevices.hpp"
32 #include "vktAmberTestCase.hpp"
33 
34 #include "vkDefs.hpp"
35 #include "vkRef.hpp"
36 #include "vkRefUtil.hpp"
37 #include "vkPlatform.hpp"
38 #include "vkPrograms.hpp"
39 #include "vkRefUtil.hpp"
40 #include "vkMemUtil.hpp"
41 #include "vkBarrierUtil.hpp"
42 #include "vkQueryUtil.hpp"
43 #include "vkBuilderUtil.hpp"
44 #include "vkTypeUtil.hpp"
45 #include "vkDeviceUtil.hpp"
46 #include "vkCmdUtil.hpp"
47 #include "vkObjUtil.hpp"
48 #include "vkBufferWithMemory.hpp"
49 #include "vkSafetyCriticalUtil.hpp"
50 #include "vkImageWithMemory.hpp"
51 
52 #include "tcuCommandLine.hpp"
53 #include "tcuTestLog.hpp"
54 #include "tcuMaybe.hpp"
55 
56 #include "deMath.h"
57 #include "deRandom.hpp"
58 #include "deStringUtil.hpp"
59 #include "deUniquePtr.hpp"
60 
61 #include <vector>
62 #include <memory>
63 
64 using namespace vk;
65 
66 namespace vkt
67 {
68 namespace compute
69 {
70 namespace
71 {
72 
73 template <typename T, int size>
multiplyComponents(const tcu::Vector<T,size> & v)74 T multiplyComponents(const tcu::Vector<T, size> &v)
75 {
76     T accum = 1;
77     for (int i = 0; i < size; ++i)
78         accum *= v[i];
79     return accum;
80 }
81 
// Returns a*a for any type supporting multiplication.
template <typename T>
inline T squared(const T &a)
{
    return a * a;
}
87 
make2DImageCreateInfo(const tcu::IVec2 & imageSize,const VkImageUsageFlags usage)88 inline VkImageCreateInfo make2DImageCreateInfo(const tcu::IVec2 &imageSize, const VkImageUsageFlags usage)
89 {
90     const VkImageCreateInfo imageParams = {
91         VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,               // VkStructureType sType;
92         nullptr,                                           // const void* pNext;
93         0u,                                                // VkImageCreateFlags flags;
94         VK_IMAGE_TYPE_2D,                                  // VkImageType imageType;
95         VK_FORMAT_R32_UINT,                                // VkFormat format;
96         vk::makeExtent3D(imageSize.x(), imageSize.y(), 1), // VkExtent3D extent;
97         1u,                                                // uint32_t mipLevels;
98         1u,                                                // uint32_t arrayLayers;
99         VK_SAMPLE_COUNT_1_BIT,                             // VkSampleCountFlagBits samples;
100         VK_IMAGE_TILING_OPTIMAL,                           // VkImageTiling tiling;
101         usage,                                             // VkImageUsageFlags usage;
102         VK_SHARING_MODE_EXCLUSIVE,                         // VkSharingMode sharingMode;
103         0u,                                                // uint32_t queueFamilyIndexCount;
104         nullptr,                                           // const uint32_t* pQueueFamilyIndices;
105         VK_IMAGE_LAYOUT_UNDEFINED,                         // VkImageLayout initialLayout;
106     };
107     return imageParams;
108 }
109 
makeBufferImageCopy(const tcu::IVec2 & imageSize)110 inline VkBufferImageCopy makeBufferImageCopy(const tcu::IVec2 &imageSize)
111 {
112     return compute::makeBufferImageCopy(vk::makeExtent3D(imageSize.x(), imageSize.y(), 1), 1u);
113 }
114 
// Kind of descriptor-backed buffer a test operates on.
enum BufferType
{
    BUFFER_TYPE_UNIFORM,
    BUFFER_TYPE_SSBO,
};
120 
121 class SharedVarTest : public vkt::TestCase
122 {
123 public:
124     SharedVarTest(tcu::TestContext &testCtx, const std::string &name, const tcu::IVec3 &localSize,
125                   const tcu::IVec3 &workSize,
126                   const vk::ComputePipelineConstructionType computePipelineConstructionType);
127 
128     virtual void checkSupport(Context &context) const;
129     void initPrograms(SourceCollections &sourceCollections) const;
130     TestInstance *createInstance(Context &context) const;
131 
132 private:
133     const tcu::IVec3 m_localSize;
134     const tcu::IVec3 m_workSize;
135     vk::ComputePipelineConstructionType m_computePipelineConstructionType;
136 };
137 
138 class SharedVarTestInstance : public vkt::TestInstance
139 {
140 public:
141     SharedVarTestInstance(Context &context, const tcu::IVec3 &localSize, const tcu::IVec3 &workSize,
142                           const vk::ComputePipelineConstructionType computePipelineConstructionType);
143 
144     tcu::TestStatus iterate(void);
145 
146 private:
147     const tcu::IVec3 m_localSize;
148     const tcu::IVec3 m_workSize;
149     vk::ComputePipelineConstructionType m_computePipelineConstructionType;
150 };
151 
SharedVarTest(tcu::TestContext & testCtx,const std::string & name,const tcu::IVec3 & localSize,const tcu::IVec3 & workSize,const vk::ComputePipelineConstructionType computePipelineConstructionType)152 SharedVarTest::SharedVarTest(tcu::TestContext &testCtx, const std::string &name, const tcu::IVec3 &localSize,
153                              const tcu::IVec3 &workSize,
154                              const vk::ComputePipelineConstructionType computePipelineConstructionType)
155     : TestCase(testCtx, name)
156     , m_localSize(localSize)
157     , m_workSize(workSize)
158     , m_computePipelineConstructionType(computePipelineConstructionType)
159 {
160 }
161 
checkSupport(Context & context) const162 void SharedVarTest::checkSupport(Context &context) const
163 {
164     checkShaderObjectRequirements(context.getInstanceInterface(), context.getPhysicalDevice(),
165                                   m_computePipelineConstructionType);
166 }
167 
initPrograms(SourceCollections & sourceCollections) const168 void SharedVarTest::initPrograms(SourceCollections &sourceCollections) const
169 {
170     const int workGroupSize  = multiplyComponents(m_localSize);
171     const int workGroupCount = multiplyComponents(m_workSize);
172     const int numValues      = workGroupSize * workGroupCount;
173 
174     std::ostringstream src;
175     src << "#version 310 es\n"
176         << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y()
177         << ", local_size_z = " << m_localSize.z() << ") in;\n"
178         << "layout(binding = 0) writeonly buffer Output {\n"
179         << "    uint values[" << numValues << "];\n"
180         << "} sb_out;\n\n"
181         << "shared uint offsets[" << workGroupSize << "];\n\n"
182         << "void main (void) {\n"
183         << "    uint localSize  = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_WorkGroupSize.z;\n"
184         << "    uint globalNdx  = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + "
185            "gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
186         << "    uint globalOffs = localSize*globalNdx;\n"
187         << "    uint localOffs  = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_LocalInvocationID.z + "
188            "gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x;\n"
189         << "\n"
190         << "    offsets[localSize-localOffs-1u] = globalOffs + localOffs*localOffs;\n"
191         << "    memoryBarrierShared();\n"
192         << "    barrier();\n"
193         << "    sb_out.values[globalOffs + localOffs] = offsets[localOffs];\n"
194         << "}\n";
195 
196     sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
197 }
198 
createInstance(Context & context) const199 TestInstance *SharedVarTest::createInstance(Context &context) const
200 {
201     return new SharedVarTestInstance(context, m_localSize, m_workSize, m_computePipelineConstructionType);
202 }
203 
SharedVarTestInstance(Context & context,const tcu::IVec3 & localSize,const tcu::IVec3 & workSize,const vk::ComputePipelineConstructionType computePipelineConstructionType)204 SharedVarTestInstance::SharedVarTestInstance(Context &context, const tcu::IVec3 &localSize, const tcu::IVec3 &workSize,
205                                              const vk::ComputePipelineConstructionType computePipelineConstructionType)
206     : TestInstance(context)
207     , m_localSize(localSize)
208     , m_workSize(workSize)
209     , m_computePipelineConstructionType(computePipelineConstructionType)
210 {
211 }
212 
iterate(void)213 tcu::TestStatus SharedVarTestInstance::iterate(void)
214 {
215     const DeviceInterface &vk       = m_context.getDeviceInterface();
216     const VkDevice device           = m_context.getDevice();
217     const VkQueue queue             = m_context.getUniversalQueue();
218     const uint32_t queueFamilyIndex = m_context.getUniversalQueueFamilyIndex();
219     Allocator &allocator            = m_context.getDefaultAllocator();
220 
221     const int workGroupSize  = multiplyComponents(m_localSize);
222     const int workGroupCount = multiplyComponents(m_workSize);
223 
224     // Create a buffer and host-visible memory for it
225 
226     const VkDeviceSize bufferSizeBytes = sizeof(uint32_t) * workGroupSize * workGroupCount;
227     const BufferWithMemory buffer(vk, device, allocator,
228                                   makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT),
229                                   MemoryRequirement::HostVisible);
230 
231     // Create descriptor set
232 
233     const Unique<VkDescriptorSetLayout> descriptorSetLayout(
234         DescriptorSetLayoutBuilder()
235             .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
236             .build(vk, device));
237 
238     const Unique<VkDescriptorPool> descriptorPool(
239         DescriptorPoolBuilder()
240             .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
241             .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
242 
243     const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
244 
245     const VkDescriptorBufferInfo descriptorInfo = makeDescriptorBufferInfo(*buffer, 0ull, bufferSizeBytes);
246     DescriptorSetUpdateBuilder()
247         .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u),
248                      VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &descriptorInfo)
249         .update(vk, device);
250 
251     // Perform the computation
252 
253     ComputePipelineWrapper pipeline(vk, device, m_computePipelineConstructionType,
254                                     m_context.getBinaryCollection().get("comp"));
255     pipeline.setDescriptorSetLayout(descriptorSetLayout.get());
256     pipeline.buildPipeline();
257 
258     const VkBufferMemoryBarrier computeFinishBarrier =
259         makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer, 0ull, bufferSizeBytes);
260 
261     const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
262     const Unique<VkCommandBuffer> cmdBuffer(
263         allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
264 
265     // Start recording commands
266 
267     beginCommandBuffer(vk, *cmdBuffer);
268 
269     pipeline.bind(*cmdBuffer);
270     vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline.getPipelineLayout(), 0u, 1u,
271                              &descriptorSet.get(), 0u, nullptr);
272 
273     vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
274 
275     vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT,
276                           (VkDependencyFlags)0, 0, nullptr, 1, &computeFinishBarrier, 0, nullptr);
277 
278     endCommandBuffer(vk, *cmdBuffer);
279 
280     // Wait for completion
281 
282     submitCommandsAndWait(vk, device, queue, *cmdBuffer);
283 
284     // Validate the results
285 
286     const Allocation &bufferAllocation = buffer.getAllocation();
287     invalidateAlloc(vk, device, bufferAllocation);
288 
289     const uint32_t *bufferPtr = static_cast<uint32_t *>(bufferAllocation.getHostPtr());
290 
291     for (int groupNdx = 0; groupNdx < workGroupCount; ++groupNdx)
292     {
293         const int globalOffset = groupNdx * workGroupSize;
294         for (int localOffset = 0; localOffset < workGroupSize; ++localOffset)
295         {
296             const uint32_t res = bufferPtr[globalOffset + localOffset];
297             const uint32_t ref = globalOffset + squared(workGroupSize - localOffset - 1);
298 
299             if (res != ref)
300             {
301                 std::ostringstream msg;
302                 msg << "Comparison failed for Output.values[" << (globalOffset + localOffset) << "]";
303                 return tcu::TestStatus::fail(msg.str());
304             }
305         }
306     }
307     return tcu::TestStatus::pass("Compute succeeded");
308 }
309 
310 class SharedVarAtomicOpTest : public vkt::TestCase
311 {
312 public:
313     SharedVarAtomicOpTest(tcu::TestContext &testCtx, const std::string &name, const tcu::IVec3 &localSize,
314                           const tcu::IVec3 &workSize,
315                           const vk::ComputePipelineConstructionType computePipelineConstructionType);
316 
317     virtual void checkSupport(Context &context) const;
318     void initPrograms(SourceCollections &sourceCollections) const;
319     TestInstance *createInstance(Context &context) const;
320 
321 private:
322     const tcu::IVec3 m_localSize;
323     const tcu::IVec3 m_workSize;
324     vk::ComputePipelineConstructionType m_computePipelineConstructionType;
325 };
326 
327 class SharedVarAtomicOpTestInstance : public vkt::TestInstance
328 {
329 public:
330     SharedVarAtomicOpTestInstance(Context &context, const tcu::IVec3 &localSize, const tcu::IVec3 &workSize,
331                                   const vk::ComputePipelineConstructionType computePipelineConstructionType);
332 
333     tcu::TestStatus iterate(void);
334 
335 private:
336     const tcu::IVec3 m_localSize;
337     const tcu::IVec3 m_workSize;
338     vk::ComputePipelineConstructionType m_computePipelineConstructionType;
339 };
340 
SharedVarAtomicOpTest(tcu::TestContext & testCtx,const std::string & name,const tcu::IVec3 & localSize,const tcu::IVec3 & workSize,const vk::ComputePipelineConstructionType computePipelineConstructionType)341 SharedVarAtomicOpTest::SharedVarAtomicOpTest(tcu::TestContext &testCtx, const std::string &name,
342                                              const tcu::IVec3 &localSize, const tcu::IVec3 &workSize,
343                                              const vk::ComputePipelineConstructionType computePipelineConstructionType)
344     : TestCase(testCtx, name)
345     , m_localSize(localSize)
346     , m_workSize(workSize)
347     , m_computePipelineConstructionType(computePipelineConstructionType)
348 {
349 }
350 
checkSupport(Context & context) const351 void SharedVarAtomicOpTest::checkSupport(Context &context) const
352 {
353     checkShaderObjectRequirements(context.getInstanceInterface(), context.getPhysicalDevice(),
354                                   m_computePipelineConstructionType);
355 }
356 
initPrograms(SourceCollections & sourceCollections) const357 void SharedVarAtomicOpTest::initPrograms(SourceCollections &sourceCollections) const
358 {
359     const int workGroupSize  = multiplyComponents(m_localSize);
360     const int workGroupCount = multiplyComponents(m_workSize);
361     const int numValues      = workGroupSize * workGroupCount;
362 
363     std::ostringstream src;
364     src << "#version 310 es\n"
365         << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y()
366         << ", local_size_z = " << m_localSize.z() << ") in;\n"
367         << "layout(binding = 0) writeonly buffer Output {\n"
368         << "    uint values[" << numValues << "];\n"
369         << "} sb_out;\n\n"
370         << "shared uint count;\n\n"
371         << "void main (void) {\n"
372         << "    uint localSize  = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_WorkGroupSize.z;\n"
373         << "    uint globalNdx  = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + "
374            "gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
375         << "    uint globalOffs = localSize*globalNdx;\n"
376         << "\n"
377         << "    count = 0u;\n"
378         << "    memoryBarrierShared();\n"
379         << "    barrier();\n"
380         << "    uint oldVal = atomicAdd(count, 1u);\n"
381         << "    sb_out.values[globalOffs+oldVal] = oldVal+1u;\n"
382         << "}\n";
383 
384     sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
385 }
386 
createInstance(Context & context) const387 TestInstance *SharedVarAtomicOpTest::createInstance(Context &context) const
388 {
389     return new SharedVarAtomicOpTestInstance(context, m_localSize, m_workSize, m_computePipelineConstructionType);
390 }
391 
SharedVarAtomicOpTestInstance(Context & context,const tcu::IVec3 & localSize,const tcu::IVec3 & workSize,const vk::ComputePipelineConstructionType computePipelineConstructionType)392 SharedVarAtomicOpTestInstance::SharedVarAtomicOpTestInstance(
393     Context &context, const tcu::IVec3 &localSize, const tcu::IVec3 &workSize,
394     const vk::ComputePipelineConstructionType computePipelineConstructionType)
395     : TestInstance(context)
396     , m_localSize(localSize)
397     , m_workSize(workSize)
398     , m_computePipelineConstructionType(computePipelineConstructionType)
399 {
400 }
401 
iterate(void)402 tcu::TestStatus SharedVarAtomicOpTestInstance::iterate(void)
403 {
404     const DeviceInterface &vk       = m_context.getDeviceInterface();
405     const VkDevice device           = m_context.getDevice();
406     const VkQueue queue             = m_context.getUniversalQueue();
407     const uint32_t queueFamilyIndex = m_context.getUniversalQueueFamilyIndex();
408     Allocator &allocator            = m_context.getDefaultAllocator();
409 
410     const int workGroupSize  = multiplyComponents(m_localSize);
411     const int workGroupCount = multiplyComponents(m_workSize);
412 
413     // Create a buffer and host-visible memory for it
414 
415     const VkDeviceSize bufferSizeBytes = sizeof(uint32_t) * workGroupSize * workGroupCount;
416     const BufferWithMemory buffer(vk, device, allocator,
417                                   makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT),
418                                   MemoryRequirement::HostVisible);
419 
420     // Create descriptor set
421 
422     const Unique<VkDescriptorSetLayout> descriptorSetLayout(
423         DescriptorSetLayoutBuilder()
424             .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
425             .build(vk, device));
426 
427     const Unique<VkDescriptorPool> descriptorPool(
428         DescriptorPoolBuilder()
429             .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
430             .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
431 
432     const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
433 
434     const VkDescriptorBufferInfo descriptorInfo = makeDescriptorBufferInfo(*buffer, 0ull, bufferSizeBytes);
435     DescriptorSetUpdateBuilder()
436         .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u),
437                      VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &descriptorInfo)
438         .update(vk, device);
439 
440     // Perform the computation
441 
442     ComputePipelineWrapper pipeline(vk, device, m_computePipelineConstructionType,
443                                     m_context.getBinaryCollection().get("comp"));
444     pipeline.setDescriptorSetLayout(descriptorSetLayout.get());
445     pipeline.buildPipeline();
446 
447     const VkBufferMemoryBarrier computeFinishBarrier =
448         makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer, 0ull, bufferSizeBytes);
449 
450     const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
451     const Unique<VkCommandBuffer> cmdBuffer(
452         allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
453 
454     // Start recording commands
455 
456     beginCommandBuffer(vk, *cmdBuffer);
457 
458     pipeline.bind(*cmdBuffer);
459     vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline.getPipelineLayout(), 0u, 1u,
460                              &descriptorSet.get(), 0u, nullptr);
461 
462     vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
463 
464     vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT,
465                           (VkDependencyFlags)0, 0, nullptr, 1u, &computeFinishBarrier, 0, nullptr);
466 
467     endCommandBuffer(vk, *cmdBuffer);
468 
469     // Wait for completion
470 
471     submitCommandsAndWait(vk, device, queue, *cmdBuffer);
472 
473     // Validate the results
474 
475     const Allocation &bufferAllocation = buffer.getAllocation();
476     invalidateAlloc(vk, device, bufferAllocation);
477 
478     const uint32_t *bufferPtr = static_cast<uint32_t *>(bufferAllocation.getHostPtr());
479 
480     for (int groupNdx = 0; groupNdx < workGroupCount; ++groupNdx)
481     {
482         const int globalOffset = groupNdx * workGroupSize;
483         for (int localOffset = 0; localOffset < workGroupSize; ++localOffset)
484         {
485             const uint32_t res = bufferPtr[globalOffset + localOffset];
486             const uint32_t ref = localOffset + 1;
487 
488             if (res != ref)
489             {
490                 std::ostringstream msg;
491                 msg << "Comparison failed for Output.values[" << (globalOffset + localOffset) << "]";
492                 return tcu::TestStatus::fail(msg.str());
493             }
494         }
495     }
496     return tcu::TestStatus::pass("Compute succeeded");
497 }
498 
499 class SSBOLocalBarrierTest : public vkt::TestCase
500 {
501 public:
502     SSBOLocalBarrierTest(tcu::TestContext &testCtx, const std::string &name, const tcu::IVec3 &localSize,
503                          const tcu::IVec3 &workSize,
504                          const vk::ComputePipelineConstructionType computePipelineConstructionType);
505 
506     virtual void checkSupport(Context &context) const;
507     void initPrograms(SourceCollections &sourceCollections) const;
508     TestInstance *createInstance(Context &context) const;
509 
510 private:
511     const tcu::IVec3 m_localSize;
512     const tcu::IVec3 m_workSize;
513     vk::ComputePipelineConstructionType m_computePipelineConstructionType;
514 };
515 
516 class SSBOLocalBarrierTestInstance : public vkt::TestInstance
517 {
518 public:
519     SSBOLocalBarrierTestInstance(Context &context, const tcu::IVec3 &localSize, const tcu::IVec3 &workSize,
520                                  const vk::ComputePipelineConstructionType computePipelineConstructionType);
521 
522     tcu::TestStatus iterate(void);
523 
524 private:
525     const tcu::IVec3 m_localSize;
526     const tcu::IVec3 m_workSize;
527     vk::ComputePipelineConstructionType m_computePipelineConstructionType;
528 };
529 
SSBOLocalBarrierTest(tcu::TestContext & testCtx,const std::string & name,const tcu::IVec3 & localSize,const tcu::IVec3 & workSize,const vk::ComputePipelineConstructionType computePipelineConstructionType)530 SSBOLocalBarrierTest::SSBOLocalBarrierTest(tcu::TestContext &testCtx, const std::string &name,
531                                            const tcu::IVec3 &localSize, const tcu::IVec3 &workSize,
532                                            const vk::ComputePipelineConstructionType computePipelineConstructionType)
533     : TestCase(testCtx, name)
534     , m_localSize(localSize)
535     , m_workSize(workSize)
536     , m_computePipelineConstructionType(computePipelineConstructionType)
537 {
538 }
539 
checkSupport(Context & context) const540 void SSBOLocalBarrierTest::checkSupport(Context &context) const
541 {
542     checkShaderObjectRequirements(context.getInstanceInterface(), context.getPhysicalDevice(),
543                                   m_computePipelineConstructionType);
544 }
545 
initPrograms(SourceCollections & sourceCollections) const546 void SSBOLocalBarrierTest::initPrograms(SourceCollections &sourceCollections) const
547 {
548     const int workGroupSize  = multiplyComponents(m_localSize);
549     const int workGroupCount = multiplyComponents(m_workSize);
550     const int numValues      = workGroupSize * workGroupCount;
551 
552     std::ostringstream src;
553     src << "#version 310 es\n"
554         << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y()
555         << ", local_size_z = " << m_localSize.z() << ") in;\n"
556         << "layout(binding = 0) coherent buffer Output {\n"
557         << "    uint values[" << numValues << "];\n"
558         << "} sb_out;\n\n"
559         << "void main (void) {\n"
560         << "    uint localSize  = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_WorkGroupSize.z;\n"
561         << "    uint globalNdx  = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + "
562            "gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
563         << "    uint globalOffs = localSize*globalNdx;\n"
564         << "    uint localOffs  = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_LocalInvocationID.z + "
565            "gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x;\n"
566         << "\n"
567         << "    sb_out.values[globalOffs + localOffs] = globalOffs;\n"
568         << "    memoryBarrierBuffer();\n"
569         << "    barrier();\n"
570         << "    sb_out.values[globalOffs + ((localOffs+1u)%localSize)] += localOffs;\n" // += so we read and write
571         << "    memoryBarrierBuffer();\n"
572         << "    barrier();\n"
573         << "    sb_out.values[globalOffs + ((localOffs+2u)%localSize)] += localOffs;\n"
574         << "}\n";
575 
576     sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
577 }
578 
createInstance(Context & context) const579 TestInstance *SSBOLocalBarrierTest::createInstance(Context &context) const
580 {
581     return new SSBOLocalBarrierTestInstance(context, m_localSize, m_workSize, m_computePipelineConstructionType);
582 }
583 
SSBOLocalBarrierTestInstance(Context & context,const tcu::IVec3 & localSize,const tcu::IVec3 & workSize,const vk::ComputePipelineConstructionType computePipelineConstructionType)584 SSBOLocalBarrierTestInstance::SSBOLocalBarrierTestInstance(
585     Context &context, const tcu::IVec3 &localSize, const tcu::IVec3 &workSize,
586     const vk::ComputePipelineConstructionType computePipelineConstructionType)
587     : TestInstance(context)
588     , m_localSize(localSize)
589     , m_workSize(workSize)
590     , m_computePipelineConstructionType(computePipelineConstructionType)
591 {
592 }
593 
iterate(void)594 tcu::TestStatus SSBOLocalBarrierTestInstance::iterate(void)
595 {
596     const DeviceInterface &vk       = m_context.getDeviceInterface();
597     const VkDevice device           = m_context.getDevice();
598     const VkQueue queue             = m_context.getUniversalQueue();
599     const uint32_t queueFamilyIndex = m_context.getUniversalQueueFamilyIndex();
600     Allocator &allocator            = m_context.getDefaultAllocator();
601 
602     const int workGroupSize  = multiplyComponents(m_localSize);
603     const int workGroupCount = multiplyComponents(m_workSize);
604 
605     // Create a buffer and host-visible memory for it
606 
607     const VkDeviceSize bufferSizeBytes = sizeof(uint32_t) * workGroupSize * workGroupCount;
608     const BufferWithMemory buffer(vk, device, allocator,
609                                   makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT),
610                                   MemoryRequirement::HostVisible);
611 
612     // Create descriptor set
613 
614     const Unique<VkDescriptorSetLayout> descriptorSetLayout(
615         DescriptorSetLayoutBuilder()
616             .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
617             .build(vk, device));
618 
619     const Unique<VkDescriptorPool> descriptorPool(
620         DescriptorPoolBuilder()
621             .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
622             .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
623 
624     const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
625 
626     const VkDescriptorBufferInfo descriptorInfo = makeDescriptorBufferInfo(*buffer, 0ull, bufferSizeBytes);
627     DescriptorSetUpdateBuilder()
628         .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u),
629                      VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &descriptorInfo)
630         .update(vk, device);
631 
632     // Perform the computation
633 
634     ComputePipelineWrapper pipeline(vk, device, m_computePipelineConstructionType,
635                                     m_context.getBinaryCollection().get("comp"));
636     pipeline.setDescriptorSetLayout(descriptorSetLayout.get());
637     pipeline.buildPipeline();
638 
639     const VkBufferMemoryBarrier computeFinishBarrier =
640         makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer, 0ull, bufferSizeBytes);
641 
642     const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
643     const Unique<VkCommandBuffer> cmdBuffer(
644         allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
645 
646     // Start recording commands
647 
648     beginCommandBuffer(vk, *cmdBuffer);
649 
650     pipeline.bind(*cmdBuffer);
651     vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline.getPipelineLayout(), 0u, 1u,
652                              &descriptorSet.get(), 0u, nullptr);
653 
654     vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
655 
656     vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT,
657                           (VkDependencyFlags)0, 0, nullptr, 1, &computeFinishBarrier, 0, nullptr);
658 
659     endCommandBuffer(vk, *cmdBuffer);
660 
661     // Wait for completion
662 
663     submitCommandsAndWait(vk, device, queue, *cmdBuffer);
664 
665     // Validate the results
666 
667     const Allocation &bufferAllocation = buffer.getAllocation();
668     invalidateAlloc(vk, device, bufferAllocation);
669 
670     const uint32_t *bufferPtr = static_cast<uint32_t *>(bufferAllocation.getHostPtr());
671 
672     for (int groupNdx = 0; groupNdx < workGroupCount; ++groupNdx)
673     {
674         const int globalOffset = groupNdx * workGroupSize;
675         for (int localOffset = 0; localOffset < workGroupSize; ++localOffset)
676         {
677             const uint32_t res = bufferPtr[globalOffset + localOffset];
678             const int offs0    = localOffset - 1 < 0 ? ((localOffset + workGroupSize - 1) % workGroupSize) :
679                                                        ((localOffset - 1) % workGroupSize);
680             const int offs1    = localOffset - 2 < 0 ? ((localOffset + workGroupSize - 2) % workGroupSize) :
681                                                        ((localOffset - 2) % workGroupSize);
682             const uint32_t ref = static_cast<uint32_t>(globalOffset + offs0 + offs1);
683 
684             if (res != ref)
685             {
686                 std::ostringstream msg;
687                 msg << "Comparison failed for Output.values[" << (globalOffset + localOffset) << "]";
688                 return tcu::TestStatus::fail(msg.str());
689             }
690         }
691     }
692     return tcu::TestStatus::pass("Compute succeeded");
693 }
694 
// Test case: a compute shader reads every texel of an r32ui storage image and
// copies it into an SSBO; the instance then compares the SSBO contents against
// the data originally uploaded to the image.
class CopyImageToSSBOTest : public vkt::TestCase
{
public:
    CopyImageToSSBOTest(tcu::TestContext &testCtx, const std::string &name, const tcu::IVec2 &localSize,
                        const tcu::IVec2 &imageSize,
                        const vk::ComputePipelineConstructionType computePipelineConstructionType);

    virtual void checkSupport(Context &context) const;
    void initPrograms(SourceCollections &sourceCollections) const;
    TestInstance *createInstance(Context &context) const;

private:
    const tcu::IVec2 m_localSize; // workgroup (local_size_x/y) dimensions; must evenly divide m_imageSize
    const tcu::IVec2 m_imageSize; // source image dimensions (also sizes the output SSBO)
    vk::ComputePipelineConstructionType m_computePipelineConstructionType; // monolithic pipeline vs. shader objects
};
711 
// Per-run instance for CopyImageToSSBOTest; iterate() performs the upload,
// dispatch and result validation.
class CopyImageToSSBOTestInstance : public vkt::TestInstance
{
public:
    CopyImageToSSBOTestInstance(Context &context, const tcu::IVec2 &localSize, const tcu::IVec2 &imageSize,
                                const vk::ComputePipelineConstructionType computePipelineConstructionType);

    tcu::TestStatus iterate(void);

private:
    const tcu::IVec2 m_localSize; // workgroup dimensions (copied from the test case)
    const tcu::IVec2 m_imageSize; // image dimensions (copied from the test case)
    vk::ComputePipelineConstructionType m_computePipelineConstructionType; // pipeline construction mode
};
725 
// Constructor: stores the test parameters. The asserts guarantee the image can
// be tiled exactly by workgroups of the requested local size, since iterate()
// dispatches (imageSize / localSize) workgroups with no remainder handling.
CopyImageToSSBOTest::CopyImageToSSBOTest(tcu::TestContext &testCtx, const std::string &name,
                                         const tcu::IVec2 &localSize, const tcu::IVec2 &imageSize,
                                         const vk::ComputePipelineConstructionType computePipelineConstructionType)
    : TestCase(testCtx, name)
    , m_localSize(localSize)
    , m_imageSize(imageSize)
    , m_computePipelineConstructionType(computePipelineConstructionType)
{
    DE_ASSERT(m_imageSize.x() % m_localSize.x() == 0);
    DE_ASSERT(m_imageSize.y() % m_localSize.y() == 0);
}
737 
checkSupport(Context & context) const738 void CopyImageToSSBOTest::checkSupport(Context &context) const
739 {
740     checkShaderObjectRequirements(context.getInstanceInterface(), context.getPhysicalDevice(),
741                                   m_computePipelineConstructionType);
742 }
743 
initPrograms(SourceCollections & sourceCollections) const744 void CopyImageToSSBOTest::initPrograms(SourceCollections &sourceCollections) const
745 {
746     std::ostringstream src;
747     src << "#version 310 es\n"
748         << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ") in;\n"
749         << "layout(binding = 1, r32ui) readonly uniform highp uimage2D u_srcImg;\n"
750         << "layout(binding = 0) writeonly buffer Output {\n"
751         << "    uint values[" << (m_imageSize.x() * m_imageSize.y()) << "];\n"
752         << "} sb_out;\n\n"
753         << "void main (void) {\n"
754         << "    uint stride = gl_NumWorkGroups.x*gl_WorkGroupSize.x;\n"
755         << "    uint value  = imageLoad(u_srcImg, ivec2(gl_GlobalInvocationID.xy)).x;\n"
756         << "    sb_out.values[gl_GlobalInvocationID.y*stride + gl_GlobalInvocationID.x] = value;\n"
757         << "}\n";
758 
759     sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
760 }
761 
createInstance(Context & context) const762 TestInstance *CopyImageToSSBOTest::createInstance(Context &context) const
763 {
764     return new CopyImageToSSBOTestInstance(context, m_localSize, m_imageSize, m_computePipelineConstructionType);
765 }
766 
// Instance constructor: only copies parameters; all Vulkan work happens in iterate().
CopyImageToSSBOTestInstance::CopyImageToSSBOTestInstance(
    Context &context, const tcu::IVec2 &localSize, const tcu::IVec2 &imageSize,
    const vk::ComputePipelineConstructionType computePipelineConstructionType)
    : TestInstance(context)
    , m_localSize(localSize)
    , m_imageSize(imageSize)
    , m_computePipelineConstructionType(computePipelineConstructionType)
{
}
776 
// Runs the image->SSBO copy test:
//  1. create an r32ui image and fill it from a host-written staging buffer,
//  2. dispatch a compute shader that copies each texel into an SSBO,
//  3. read the SSBO back on the host and compare it word-for-word against the
//     staging data.
// Returns pass, or fail naming the first mismatching output element.
tcu::TestStatus CopyImageToSSBOTestInstance::iterate(void)
{
    const DeviceInterface &vk       = m_context.getDeviceInterface();
    const VkDevice device           = m_context.getDevice();
    const VkQueue queue             = m_context.getUniversalQueue();
    const uint32_t queueFamilyIndex = m_context.getUniversalQueueFamilyIndex();
    Allocator &allocator            = m_context.getDefaultAllocator();

    // Create an image
    // TRANSFER_DST for the staging-buffer upload, STORAGE for the shader's imageLoad.
    const VkImageCreateInfo imageParams =
        make2DImageCreateInfo(m_imageSize, VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_STORAGE_BIT);
    const ImageWithMemory image(vk, device, allocator, imageParams, MemoryRequirement::Any);

    const VkImageSubresourceRange subresourceRange =
        makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
    const Unique<VkImageView> imageView(
        makeImageView(vk, device, *image, VK_IMAGE_VIEW_TYPE_2D, VK_FORMAT_R32_UINT, subresourceRange));

    // Staging buffer (source data for image)

    const uint32_t imageArea           = multiplyComponents(m_imageSize);
    const VkDeviceSize bufferSizeBytes = sizeof(uint32_t) * imageArea;

    const BufferWithMemory stagingBuffer(vk, device, allocator,
                                         makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_TRANSFER_SRC_BIT),
                                         MemoryRequirement::HostVisible);

    // Populate the staging buffer with test data (fixed seed for reproducibility)
    {
        de::Random rnd(0xab2c7);
        const Allocation &stagingBufferAllocation = stagingBuffer.getAllocation();
        uint32_t *bufferPtr                       = static_cast<uint32_t *>(stagingBufferAllocation.getHostPtr());
        for (uint32_t i = 0; i < imageArea; ++i)
            *bufferPtr++ = rnd.getUint32();

        flushAlloc(vk, device, stagingBufferAllocation);
    }

    // Create a buffer to store shader output

    const BufferWithMemory outputBuffer(vk, device, allocator,
                                        makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT),
                                        MemoryRequirement::HostVisible);

    // Create descriptor set
    // Binding 0 = output SSBO, binding 1 = source storage image (matches the shader).
    const Unique<VkDescriptorSetLayout> descriptorSetLayout(
        DescriptorSetLayoutBuilder()
            .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
            .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_SHADER_STAGE_COMPUTE_BIT)
            .build(vk, device));

    const Unique<VkDescriptorPool> descriptorPool(
        DescriptorPoolBuilder()
            .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
            .addType(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE)
            .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

    const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

    // Set the bindings

    const VkDescriptorImageInfo imageDescriptorInfo =
        makeDescriptorImageInfo(VK_NULL_HANDLE, *imageView, VK_IMAGE_LAYOUT_GENERAL);
    const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*outputBuffer, 0ull, bufferSizeBytes);

    DescriptorSetUpdateBuilder()
        .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u),
                     VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
        .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u),
                     VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &imageDescriptorInfo)
        .update(vk, device);

    // Perform the computation
    {
        ComputePipelineWrapper pipeline(vk, device, m_computePipelineConstructionType,
                                        m_context.getBinaryCollection().get("comp"));
        pipeline.setDescriptorSetLayout(descriptorSetLayout.get());
        pipeline.buildPipeline();

        // Make shader writes to the output buffer visible to host reads after submission.
        const VkBufferMemoryBarrier computeFinishBarrier = makeBufferMemoryBarrier(
            VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *outputBuffer, 0ull, bufferSizeBytes);
        // Exact division is guaranteed by the asserts in the test-case constructor.
        const tcu::IVec2 workSize = m_imageSize / m_localSize;

        // Prepare the command buffer

        const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
        const Unique<VkCommandBuffer> cmdBuffer(
            allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

        // Start recording commands

        beginCommandBuffer(vk, *cmdBuffer);

        pipeline.bind(*cmdBuffer);
        vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline.getPipelineLayout(), 0u, 1u,
                                 &descriptorSet.get(), 0u, nullptr);

        // Upload the staging data into the image; the helper also handles the
        // layout transition and synchronization up to the compute stage.
        const std::vector<VkBufferImageCopy> bufferImageCopy(1, makeBufferImageCopy(m_imageSize));
        copyBufferToImage(vk, *cmdBuffer, *stagingBuffer, bufferSizeBytes, bufferImageCopy, VK_IMAGE_ASPECT_COLOR_BIT,
                          1, 1, *image, VK_IMAGE_LAYOUT_GENERAL, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT);

        vk.cmdDispatch(*cmdBuffer, workSize.x(), workSize.y(), 1u);
        vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT,
                              (VkDependencyFlags)0, 0, nullptr, 1, &computeFinishBarrier, 0, nullptr);

        endCommandBuffer(vk, *cmdBuffer);

        // Wait for completion

        submitCommandsAndWait(vk, device, queue, *cmdBuffer);
    }

    // Validate the results

    const Allocation &outputBufferAllocation = outputBuffer.getAllocation();
    invalidateAlloc(vk, device, outputBufferAllocation);

    const uint32_t *bufferPtr    = static_cast<uint32_t *>(outputBufferAllocation.getHostPtr());
    const uint32_t *refBufferPtr = static_cast<uint32_t *>(stagingBuffer.getAllocation().getHostPtr());

    for (uint32_t ndx = 0; ndx < imageArea; ++ndx)
    {
        const uint32_t res = *(bufferPtr + ndx);
        const uint32_t ref = *(refBufferPtr + ndx);

        if (res != ref)
        {
            std::ostringstream msg;
            msg << "Comparison failed for Output.values[" << ndx << "]";
            return tcu::TestStatus::fail(msg.str());
        }
    }
    return tcu::TestStatus::pass("Compute succeeded");
}
913 
// Test case: the inverse of CopyImageToSSBOTest — a compute shader reads an
// SSBO and writes each value to the matching texel of an r32ui storage image;
// the image is then read back and compared against the SSBO contents.
class CopySSBOToImageTest : public vkt::TestCase
{
public:
    CopySSBOToImageTest(tcu::TestContext &testCtx, const std::string &name, const tcu::IVec2 &localSize,
                        const tcu::IVec2 &imageSize,
                        const vk::ComputePipelineConstructionType computePipelineConstructionType);

    virtual void checkSupport(Context &context) const;
    void initPrograms(SourceCollections &sourceCollections) const;
    TestInstance *createInstance(Context &context) const;

private:
    const tcu::IVec2 m_localSize; // workgroup (local_size_x/y) dimensions; must evenly divide m_imageSize
    const tcu::IVec2 m_imageSize; // destination image dimensions (also sizes the input SSBO)
    vk::ComputePipelineConstructionType m_computePipelineConstructionType; // monolithic pipeline vs. shader objects
};
930 
// Per-run instance for CopySSBOToImageTest; iterate() performs the dispatch,
// image readback and validation.
class CopySSBOToImageTestInstance : public vkt::TestInstance
{
public:
    CopySSBOToImageTestInstance(Context &context, const tcu::IVec2 &localSize, const tcu::IVec2 &imageSize,
                                const vk::ComputePipelineConstructionType computePipelineConstructionType);

    tcu::TestStatus iterate(void);

private:
    const tcu::IVec2 m_localSize; // workgroup dimensions (copied from the test case)
    const tcu::IVec2 m_imageSize; // image dimensions (copied from the test case)
    vk::ComputePipelineConstructionType m_computePipelineConstructionType; // pipeline construction mode
};
944 
// Constructor: stores the test parameters. The asserts guarantee the image can
// be tiled exactly by workgroups of the requested local size, since iterate()
// dispatches (imageSize / localSize) workgroups with no remainder handling.
CopySSBOToImageTest::CopySSBOToImageTest(tcu::TestContext &testCtx, const std::string &name,
                                         const tcu::IVec2 &localSize, const tcu::IVec2 &imageSize,
                                         const vk::ComputePipelineConstructionType computePipelineConstructionType)
    : TestCase(testCtx, name)
    , m_localSize(localSize)
    , m_imageSize(imageSize)
    , m_computePipelineConstructionType(computePipelineConstructionType)
{
    DE_ASSERT(m_imageSize.x() % m_localSize.x() == 0);
    DE_ASSERT(m_imageSize.y() % m_localSize.y() == 0);
}
956 
checkSupport(Context & context) const957 void CopySSBOToImageTest::checkSupport(Context &context) const
958 {
959     checkShaderObjectRequirements(context.getInstanceInterface(), context.getPhysicalDevice(),
960                                   m_computePipelineConstructionType);
961 }
962 
initPrograms(SourceCollections & sourceCollections) const963 void CopySSBOToImageTest::initPrograms(SourceCollections &sourceCollections) const
964 {
965     std::ostringstream src;
966     src << "#version 310 es\n"
967         << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ") in;\n"
968         << "layout(binding = 1, r32ui) writeonly uniform highp uimage2D u_dstImg;\n"
969         << "layout(binding = 0) readonly buffer Input {\n"
970         << "    uint values[" << (m_imageSize.x() * m_imageSize.y()) << "];\n"
971         << "} sb_in;\n\n"
972         << "void main (void) {\n"
973         << "    uint stride = gl_NumWorkGroups.x*gl_WorkGroupSize.x;\n"
974         << "    uint value  = sb_in.values[gl_GlobalInvocationID.y*stride + gl_GlobalInvocationID.x];\n"
975         << "    imageStore(u_dstImg, ivec2(gl_GlobalInvocationID.xy), uvec4(value, 0, 0, 0));\n"
976         << "}\n";
977 
978     sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
979 }
980 
createInstance(Context & context) const981 TestInstance *CopySSBOToImageTest::createInstance(Context &context) const
982 {
983     return new CopySSBOToImageTestInstance(context, m_localSize, m_imageSize, m_computePipelineConstructionType);
984 }
985 
// Instance constructor: only copies parameters; all Vulkan work happens in iterate().
CopySSBOToImageTestInstance::CopySSBOToImageTestInstance(
    Context &context, const tcu::IVec2 &localSize, const tcu::IVec2 &imageSize,
    const vk::ComputePipelineConstructionType computePipelineConstructionType)
    : TestInstance(context)
    , m_localSize(localSize)
    , m_imageSize(imageSize)
    , m_computePipelineConstructionType(computePipelineConstructionType)
{
}
995 
// Runs the SSBO->image copy test:
//  1. fill an SSBO on the host with pseudorandom words,
//  2. dispatch a compute shader that writes each word to an r32ui image,
//  3. copy the image into a readback buffer and compare it word-for-word
//     against the input SSBO.
// Returns pass, or fail naming the first mismatching pixel.
tcu::TestStatus CopySSBOToImageTestInstance::iterate(void)
{
    ContextCommonData data     = m_context.getContextCommonData();
    const DeviceInterface &vkd = data.vkd;

    // Create an image, a view, and the output buffer
    // TRANSFER_SRC for the readback copy, STORAGE for the shader's imageStore.
    const VkImageSubresourceRange subresourceRange =
        makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
    ImageWithBuffer imageWithBuffer(
        vkd, data.device, data.allocator, vk::makeExtent3D(m_imageSize.x(), m_imageSize.y(), 1), VK_FORMAT_R32_UINT,
        VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_STORAGE_BIT, vk::VK_IMAGE_TYPE_2D, subresourceRange);

    const uint32_t imageArea           = multiplyComponents(m_imageSize);
    const VkDeviceSize bufferSizeBytes = sizeof(uint32_t) * imageArea;

    const BufferWithMemory inputBuffer(vkd, data.device, data.allocator,
                                       makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT),
                                       MemoryRequirement::HostVisible);

    // Populate the buffer with test data (fixed seed for reproducibility)
    {
        de::Random rnd(0x77238ac2);
        const Allocation &inputBufferAllocation = inputBuffer.getAllocation();
        uint32_t *bufferPtr                     = static_cast<uint32_t *>(inputBufferAllocation.getHostPtr());
        for (uint32_t i = 0; i < imageArea; ++i)
            *bufferPtr++ = rnd.getUint32();

        flushAlloc(vkd, data.device, inputBufferAllocation);
    }

    // Create descriptor set
    // Binding 0 = input SSBO, binding 1 = destination storage image (matches the shader).
    const Unique<VkDescriptorSetLayout> descriptorSetLayout(
        DescriptorSetLayoutBuilder()
            .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
            .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_SHADER_STAGE_COMPUTE_BIT)
            .build(vkd, data.device));

    const Unique<VkDescriptorPool> descriptorPool(
        DescriptorPoolBuilder()
            .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
            .addType(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE)
            .build(vkd, data.device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

    const Unique<VkDescriptorSet> descriptorSet(
        makeDescriptorSet(vkd, data.device, *descriptorPool, *descriptorSetLayout));

    // Set the bindings

    const VkDescriptorImageInfo imageDescriptorInfo =
        makeDescriptorImageInfo(VK_NULL_HANDLE, imageWithBuffer.getImageView(), VK_IMAGE_LAYOUT_GENERAL);
    const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*inputBuffer, 0ull, bufferSizeBytes);

    DescriptorSetUpdateBuilder()
        .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u),
                     VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
        .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u),
                     VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &imageDescriptorInfo)
        .update(vkd, data.device);

    // Perform the computation
    {
        ComputePipelineWrapper pipeline(vkd, data.device, m_computePipelineConstructionType,
                                        m_context.getBinaryCollection().get("comp"));
        pipeline.setDescriptorSetLayout(descriptorSetLayout.get());
        pipeline.buildPipeline();

        // Make the host-written input buffer visible to shader reads.
        const VkBufferMemoryBarrier inputBufferPostHostWriteBarrier = makeBufferMemoryBarrier(
            VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *inputBuffer, 0ull, bufferSizeBytes);

        // Transition the freshly-created image (UNDEFINED) to GENERAL so the
        // shader can perform storage writes to it.
        const VkImageMemoryBarrier imageLayoutBarrier =
            makeImageMemoryBarrier(0u, VK_ACCESS_SHADER_WRITE_BIT, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL,
                                   imageWithBuffer.getImage(), subresourceRange);

        // Exact division is guaranteed by the asserts in the test-case constructor.
        const tcu::IVec2 workSize = m_imageSize / m_localSize;

        // Prepare the command buffer

        const Unique<VkCommandPool> cmdPool(makeCommandPool(vkd, data.device, data.qfIndex));
        const Unique<VkCommandBuffer> cmdBuffer(
            allocateCommandBuffer(vkd, data.device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

        // Start recording commands

        beginCommandBuffer(vkd, *cmdBuffer);

        pipeline.bind(*cmdBuffer);
        vkd.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline.getPipelineLayout(), 0u, 1u,
                                  &descriptorSet.get(), 0u, nullptr);

        vkd.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                               (VkDependencyFlags)0, 0, nullptr, 1, &inputBufferPostHostWriteBarrier, 1,
                               &imageLayoutBarrier);
        vkd.cmdDispatch(*cmdBuffer, workSize.x(), workSize.y(), 1u);

        // Read the image back into the buffer owned by imageWithBuffer; the
        // helper synchronizes against the shader writes.
        copyImageToBuffer(vkd, *cmdBuffer, imageWithBuffer.getImage(), imageWithBuffer.getBuffer(), m_imageSize,
                          VK_ACCESS_SHADER_WRITE_BIT, VK_IMAGE_LAYOUT_GENERAL);

        endCommandBuffer(vkd, *cmdBuffer);

        // Wait for completion

        submitCommandsAndWait(vkd, data.device, data.queue, *cmdBuffer);
    }

    // Validate the results

    const Allocation &outputBufferAllocation = imageWithBuffer.getBufferAllocation();
    invalidateAlloc(vkd, data.device, outputBufferAllocation);

    const uint32_t *bufferPtr    = static_cast<uint32_t *>(outputBufferAllocation.getHostPtr());
    const uint32_t *refBufferPtr = static_cast<uint32_t *>(inputBuffer.getAllocation().getHostPtr());

    for (uint32_t ndx = 0; ndx < imageArea; ++ndx)
    {
        const uint32_t res = *(bufferPtr + ndx);
        const uint32_t ref = *(refBufferPtr + ndx);

        if (res != ref)
        {
            std::ostringstream msg;
            msg << "Comparison failed for pixel " << ndx;
            return tcu::TestStatus::fail(msg.str());
        }
    }
    return tcu::TestStatus::pass("Compute succeeded");
}
1122 
// Test case: a compute shader reads values from an input buffer (either a UBO
// or an SSBO, chosen by the named constructors) and writes the bitwise-inverted
// values to an output SSBO. The constructor is private; use the two factories.
class BufferToBufferInvertTest : public vkt::TestCase
{
public:
    virtual void checkSupport(Context &context) const;
    void initPrograms(SourceCollections &sourceCollections) const;
    TestInstance *createInstance(Context &context) const;

    // Variant reading from a uniform buffer (BUFFER_TYPE_UNIFORM).
    static BufferToBufferInvertTest *UBOToSSBOInvertCase(
        tcu::TestContext &testCtx, const std::string &name, const uint32_t numValues, const tcu::IVec3 &localSize,
        const tcu::IVec3 &workSize, const vk::ComputePipelineConstructionType computePipelineConstructionType);

    // Variant reading from a storage buffer (BUFFER_TYPE_SSBO).
    static BufferToBufferInvertTest *CopyInvertSSBOCase(
        tcu::TestContext &testCtx, const std::string &name, const uint32_t numValues, const tcu::IVec3 &localSize,
        const tcu::IVec3 &workSize, const vk::ComputePipelineConstructionType computePipelineConstructionType);

private:
    BufferToBufferInvertTest(tcu::TestContext &testCtx, const std::string &name, const uint32_t numValues,
                             const tcu::IVec3 &localSize, const tcu::IVec3 &workSize, const BufferType bufferType,
                             const vk::ComputePipelineConstructionType computePipelineConstructionType);

    const BufferType m_bufferType; // BUFFER_TYPE_UNIFORM or BUFFER_TYPE_SSBO (input buffer kind)
    const uint32_t m_numValues;    // total element count; must be a multiple of all invocations
    const tcu::IVec3 m_localSize;  // workgroup (local_size_x/y/z) dimensions
    const tcu::IVec3 m_workSize;   // number of workgroups dispatched in each dimension
    vk::ComputePipelineConstructionType m_computePipelineConstructionType; // monolithic pipeline vs. shader objects
};
1149 
// Per-run instance for BufferToBufferInvertTest; iterate() performs the
// dispatch and compares the output against the bitwise-inverted input.
class BufferToBufferInvertTestInstance : public vkt::TestInstance
{
public:
    BufferToBufferInvertTestInstance(Context &context, const uint32_t numValues, const tcu::IVec3 &localSize,
                                     const tcu::IVec3 &workSize, const BufferType bufferType,
                                     const vk::ComputePipelineConstructionType computePipelineConstructionType);

    tcu::TestStatus iterate(void);

private:
    const BufferType m_bufferType; // input buffer kind (copied from the test case)
    const uint32_t m_numValues;    // total element count (copied from the test case)
    const tcu::IVec3 m_localSize;  // workgroup dimensions (copied from the test case)
    const tcu::IVec3 m_workSize;   // dispatch dimensions (copied from the test case)
    vk::ComputePipelineConstructionType m_computePipelineConstructionType; // pipeline construction mode
};
1166 
// Private constructor (reached via the named factories): stores parameters and
// validates them. The first assert ensures every invocation processes the same
// whole number of values; the second restricts the input to the two supported
// buffer kinds.
BufferToBufferInvertTest::BufferToBufferInvertTest(
    tcu::TestContext &testCtx, const std::string &name, const uint32_t numValues, const tcu::IVec3 &localSize,
    const tcu::IVec3 &workSize, const BufferType bufferType,
    const vk::ComputePipelineConstructionType computePipelineConstructionType)
    : TestCase(testCtx, name)
    , m_bufferType(bufferType)
    , m_numValues(numValues)
    , m_localSize(localSize)
    , m_workSize(workSize)
    , m_computePipelineConstructionType(computePipelineConstructionType)
{
    DE_ASSERT(m_numValues % (multiplyComponents(m_workSize) * multiplyComponents(m_localSize)) == 0);
    DE_ASSERT(m_bufferType == BUFFER_TYPE_UNIFORM || m_bufferType == BUFFER_TYPE_SSBO);
}
1181 
UBOToSSBOInvertCase(tcu::TestContext & testCtx,const std::string & name,const uint32_t numValues,const tcu::IVec3 & localSize,const tcu::IVec3 & workSize,const vk::ComputePipelineConstructionType computePipelineConstructionType)1182 BufferToBufferInvertTest *BufferToBufferInvertTest::UBOToSSBOInvertCase(
1183     tcu::TestContext &testCtx, const std::string &name, const uint32_t numValues, const tcu::IVec3 &localSize,
1184     const tcu::IVec3 &workSize, const vk::ComputePipelineConstructionType computePipelineConstructionType)
1185 {
1186     return new BufferToBufferInvertTest(testCtx, name, numValues, localSize, workSize, BUFFER_TYPE_UNIFORM,
1187                                         computePipelineConstructionType);
1188 }
1189 
CopyInvertSSBOCase(tcu::TestContext & testCtx,const std::string & name,const uint32_t numValues,const tcu::IVec3 & localSize,const tcu::IVec3 & workSize,const vk::ComputePipelineConstructionType computePipelineConstructionType)1190 BufferToBufferInvertTest *BufferToBufferInvertTest::CopyInvertSSBOCase(
1191     tcu::TestContext &testCtx, const std::string &name, const uint32_t numValues, const tcu::IVec3 &localSize,
1192     const tcu::IVec3 &workSize, const vk::ComputePipelineConstructionType computePipelineConstructionType)
1193 {
1194     return new BufferToBufferInvertTest(testCtx, name, numValues, localSize, workSize, BUFFER_TYPE_SSBO,
1195                                         computePipelineConstructionType);
1196 }
1197 
checkSupport(Context & context) const1198 void BufferToBufferInvertTest::checkSupport(Context &context) const
1199 {
1200     checkShaderObjectRequirements(context.getInstanceInterface(), context.getPhysicalDevice(),
1201                                   m_computePipelineConstructionType);
1202 }
1203 
// Generate the GLSL compute shader for the selected input-buffer kind. Both
// variants compute a per-invocation slice of the value array and write the
// bitwise complement (~) of each input element to the output SSBO at binding 1.
// NOTE(review): the two variants intentionally differ only in the input block
// declaration (UBO at binding 0 vs. std140 SSBO at binding 0); keep the shader
// text in sync if either branch is edited.
void BufferToBufferInvertTest::initPrograms(SourceCollections &sourceCollections) const
{
    std::ostringstream src;
    if (m_bufferType == BUFFER_TYPE_UNIFORM)
    {
        // Input is a uniform buffer; output SSBO uses std140 layout.
        src << "#version 310 es\n"
            << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y()
            << ", local_size_z = " << m_localSize.z() << ") in;\n"
            << "layout(binding = 0) readonly uniform Input {\n"
            << "    uint values[" << m_numValues << "];\n"
            << "} ub_in;\n"
            << "layout(binding = 1, std140) writeonly buffer Output {\n"
            << "    uint values[" << m_numValues << "];\n"
            << "} sb_out;\n"
            << "void main (void) {\n"
            << "    uvec3 size           = gl_NumWorkGroups * gl_WorkGroupSize;\n"
            << "    uint numValuesPerInv = uint(ub_in.values.length()) / (size.x*size.y*size.z);\n"
            << "    uint groupNdx        = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + "
               "gl_GlobalInvocationID.x;\n"
            << "    uint offset          = numValuesPerInv*groupNdx;\n"
            << "\n"
            << "    for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
            << "        sb_out.values[offset + ndx] = ~ub_in.values[offset + ndx];\n"
            << "}\n";
    }
    else if (m_bufferType == BUFFER_TYPE_SSBO)
    {
        // Input is a std140 storage buffer; otherwise identical to the UBO variant.
        src << "#version 310 es\n"
            << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y()
            << ", local_size_z = " << m_localSize.z() << ") in;\n"
            << "layout(binding = 0, std140) readonly buffer Input {\n"
            << "    uint values[" << m_numValues << "];\n"
            << "} sb_in;\n"
            << "layout (binding = 1, std140) writeonly buffer Output {\n"
            << "    uint values[" << m_numValues << "];\n"
            << "} sb_out;\n"
            << "void main (void) {\n"
            << "    uvec3 size           = gl_NumWorkGroups * gl_WorkGroupSize;\n"
            << "    uint numValuesPerInv = uint(sb_in.values.length()) / (size.x*size.y*size.z);\n"
            << "    uint groupNdx        = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + "
               "gl_GlobalInvocationID.x;\n"
            << "    uint offset          = numValuesPerInv*groupNdx;\n"
            << "\n"
            << "    for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
            << "        sb_out.values[offset + ndx] = ~sb_in.values[offset + ndx];\n"
            << "}\n";
    }

    sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
}
1254 
createInstance(Context & context) const1255 TestInstance *BufferToBufferInvertTest::createInstance(Context &context) const
1256 {
1257     return new BufferToBufferInvertTestInstance(context, m_numValues, m_localSize, m_workSize, m_bufferType,
1258                                                 m_computePipelineConstructionType);
1259 }
1260 
// Instance constructor: only copies parameters; all Vulkan work happens in iterate().
BufferToBufferInvertTestInstance::BufferToBufferInvertTestInstance(
    Context &context, const uint32_t numValues, const tcu::IVec3 &localSize, const tcu::IVec3 &workSize,
    const BufferType bufferType, const vk::ComputePipelineConstructionType computePipelineConstructionType)
    : TestInstance(context)
    , m_bufferType(bufferType)
    , m_numValues(numValues)
    , m_localSize(localSize)
    , m_workSize(workSize)
    , m_computePipelineConstructionType(computePipelineConstructionType)
{
}
1272 
// Executes the buffer-invert pass: fills an input buffer (UBO or SSBO) with
// random 32-bit values, dispatches the "comp" shader which writes the bitwise
// complement of each value into a separate output SSBO, and verifies the
// output on the host.
tcu::TestStatus BufferToBufferInvertTestInstance::iterate(void)
{
    const DeviceInterface &vk       = m_context.getDeviceInterface();
    const VkDevice device           = m_context.getDevice();
    const VkQueue queue             = m_context.getUniversalQueue();
    const uint32_t queueFamilyIndex = m_context.getUniversalQueueFamilyIndex();
    Allocator &allocator            = m_context.getDefaultAllocator();

    // Customize the test based on buffer type: the input binding is either a
    // uniform or a storage buffer; the output is always a storage buffer.

    const VkBufferUsageFlags inputBufferUsageFlags =
        (m_bufferType == BUFFER_TYPE_UNIFORM ? VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT : VK_BUFFER_USAGE_STORAGE_BUFFER_BIT);
    const VkDescriptorType inputBufferDescriptorType =
        (m_bufferType == BUFFER_TYPE_UNIFORM ? VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER : VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
    const uint32_t randomSeed = (m_bufferType == BUFFER_TYPE_UNIFORM ? 0x111223f : 0x124fef);

    // Create an input buffer

    // One uvec4 per value: the shader declares the array with std140 layout,
    // where a uint array element occupies a 16-byte stride.
    const VkDeviceSize bufferSizeBytes = sizeof(tcu::UVec4) * m_numValues;
    const BufferWithMemory inputBuffer(vk, device, allocator,
                                       makeBufferCreateInfo(bufferSizeBytes, inputBufferUsageFlags),
                                       MemoryRequirement::HostVisible);

    // Fill the input buffer with data
    {
        de::Random rnd(randomSeed);
        const Allocation &inputBufferAllocation = inputBuffer.getAllocation();
        tcu::UVec4 *bufferPtr                   = static_cast<tcu::UVec4 *>(inputBufferAllocation.getHostPtr());
        for (uint32_t i = 0; i < m_numValues; ++i)
            bufferPtr[i].x() = rnd.getUint32(); // only the first component of each stride is read by the shader

        flushAlloc(vk, device, inputBufferAllocation);
    }

    // Create an output buffer

    const BufferWithMemory outputBuffer(vk, device, allocator,
                                        makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT),
                                        MemoryRequirement::HostVisible);

    // Create descriptor set: binding 0 = input buffer, binding 1 = output buffer

    const Unique<VkDescriptorSetLayout> descriptorSetLayout(
        DescriptorSetLayoutBuilder()
            .addSingleBinding(inputBufferDescriptorType, VK_SHADER_STAGE_COMPUTE_BIT)
            .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
            .build(vk, device));

    const Unique<VkDescriptorPool> descriptorPool(
        DescriptorPoolBuilder()
            .addType(inputBufferDescriptorType)
            .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
            .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

    const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

    const VkDescriptorBufferInfo inputBufferDescriptorInfo =
        makeDescriptorBufferInfo(*inputBuffer, 0ull, bufferSizeBytes);
    const VkDescriptorBufferInfo outputBufferDescriptorInfo =
        makeDescriptorBufferInfo(*outputBuffer, 0ull, bufferSizeBytes);
    DescriptorSetUpdateBuilder()
        .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), inputBufferDescriptorType,
                     &inputBufferDescriptorInfo)
        .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u),
                     VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &outputBufferDescriptorInfo)
        .update(vk, device);

    // Perform the computation

    ComputePipelineWrapper pipeline(vk, device, m_computePipelineConstructionType,
                                    m_context.getBinaryCollection().get("comp"));
    pipeline.setDescriptorSetLayout(descriptorSetLayout.get());
    pipeline.buildPipeline();

    // Make host writes to the input visible to the shader before the dispatch...
    const VkBufferMemoryBarrier hostWriteBarrier = makeBufferMemoryBarrier(
        VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *inputBuffer, 0ull, bufferSizeBytes);

    // ...and make the shader's writes to the output visible to the host read-back.
    const VkBufferMemoryBarrier shaderWriteBarrier = makeBufferMemoryBarrier(
        VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *outputBuffer, 0ull, bufferSizeBytes);

    const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
    const Unique<VkCommandBuffer> cmdBuffer(
        allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

    // Start recording commands

    beginCommandBuffer(vk, *cmdBuffer);

    pipeline.bind(*cmdBuffer);
    vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline.getPipelineLayout(), 0u, 1u,
                             &descriptorSet.get(), 0u, nullptr);

    vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                          (VkDependencyFlags)0, 0, nullptr, 1, &hostWriteBarrier, 0, nullptr);
    vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
    vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT,
                          (VkDependencyFlags)0, 0, nullptr, 1, &shaderWriteBarrier, 0, nullptr);

    endCommandBuffer(vk, *cmdBuffer);

    // Wait for completion

    submitCommandsAndWait(vk, device, queue, *cmdBuffer);

    // Validate the results

    const Allocation &outputBufferAllocation = outputBuffer.getAllocation();
    invalidateAlloc(vk, device, outputBufferAllocation);

    const tcu::UVec4 *bufferPtr    = static_cast<tcu::UVec4 *>(outputBufferAllocation.getHostPtr());
    const tcu::UVec4 *refBufferPtr = static_cast<tcu::UVec4 *>(inputBuffer.getAllocation().getHostPtr());

    for (uint32_t ndx = 0; ndx < m_numValues; ++ndx)
    {
        const uint32_t res = bufferPtr[ndx].x();
        const uint32_t ref = ~refBufferPtr[ndx].x(); // expected: bitwise complement of the input value

        if (res != ref)
        {
            std::ostringstream msg;
            msg << "Comparison failed for Output.values[" << ndx << "]";
            return tcu::TestStatus::fail(msg.str());
        }
    }
    return tcu::TestStatus::pass("Compute succeeded");
}
1399 
1400 class InvertSSBOInPlaceTest : public vkt::TestCase
1401 {
1402 public:
1403     InvertSSBOInPlaceTest(tcu::TestContext &testCtx, const std::string &name, const uint32_t numValues,
1404                           const bool sized, const tcu::IVec3 &localSize, const tcu::IVec3 &workSize,
1405                           const vk::ComputePipelineConstructionType computePipelineConstructionType);
1406 
1407     virtual void checkSupport(Context &context) const;
1408     void initPrograms(SourceCollections &sourceCollections) const;
1409     TestInstance *createInstance(Context &context) const;
1410 
1411 private:
1412     const uint32_t m_numValues;
1413     const bool m_sized;
1414     const tcu::IVec3 m_localSize;
1415     const tcu::IVec3 m_workSize;
1416     vk::ComputePipelineConstructionType m_computePipelineConstructionType;
1417 };
1418 
1419 class InvertSSBOInPlaceTestInstance : public vkt::TestInstance
1420 {
1421 public:
1422     InvertSSBOInPlaceTestInstance(Context &context, const uint32_t numValues, const tcu::IVec3 &localSize,
1423                                   const tcu::IVec3 &workSize,
1424                                   const vk::ComputePipelineConstructionType computePipelineConstructionType);
1425 
1426     tcu::TestStatus iterate(void);
1427 
1428 private:
1429     const uint32_t m_numValues;
1430     const tcu::IVec3 m_localSize;
1431     const tcu::IVec3 m_workSize;
1432     vk::ComputePipelineConstructionType m_computePipelineConstructionType;
1433 };
1434 
// Stores the test parameters. The value count must divide evenly among all
// invocations (workgroups x local size) because each invocation processes an
// equal slice of the buffer.
InvertSSBOInPlaceTest::InvertSSBOInPlaceTest(tcu::TestContext &testCtx, const std::string &name,
                                             const uint32_t numValues, const bool sized, const tcu::IVec3 &localSize,
                                             const tcu::IVec3 &workSize,
                                             const vk::ComputePipelineConstructionType computePipelineConstructionType)
    : TestCase(testCtx, name)
    , m_numValues(numValues)
    , m_sized(sized)
    , m_localSize(localSize)
    , m_workSize(workSize)
    , m_computePipelineConstructionType(computePipelineConstructionType)
{
    DE_ASSERT(m_numValues % (multiplyComponents(m_workSize) * multiplyComponents(m_localSize)) == 0);
}
1448 
// Skips the test when the requested pipeline construction path
// (e.g. shader objects) is not available on this implementation.
void InvertSSBOInPlaceTest::checkSupport(Context &context) const
{
    checkShaderObjectRequirements(context.getInstanceInterface(), context.getPhysicalDevice(),
                                  m_computePipelineConstructionType);
}
1454 
initPrograms(SourceCollections & sourceCollections) const1455 void InvertSSBOInPlaceTest::initPrograms(SourceCollections &sourceCollections) const
1456 {
1457     std::ostringstream src;
1458     src << "#version 310 es\n"
1459         << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y()
1460         << ", local_size_z = " << m_localSize.z() << ") in;\n"
1461         << "layout(binding = 0) buffer InOut {\n"
1462         << "    uint values[" << (m_sized ? de::toString(m_numValues) : "") << "];\n"
1463         << "} sb_inout;\n"
1464         << "void main (void) {\n"
1465         << "    uvec3 size           = gl_NumWorkGroups * gl_WorkGroupSize;\n"
1466         << "    uint numValuesPerInv = uint(sb_inout.values.length()) / (size.x*size.y*size.z);\n"
1467         << "    uint groupNdx        = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + "
1468            "gl_GlobalInvocationID.x;\n"
1469         << "    uint offset          = numValuesPerInv*groupNdx;\n"
1470         << "\n"
1471         << "    for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
1472         << "        sb_inout.values[offset + ndx] = ~sb_inout.values[offset + ndx];\n"
1473         << "}\n";
1474 
1475     sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
1476 }
1477 
createInstance(Context & context) const1478 TestInstance *InvertSSBOInPlaceTest::createInstance(Context &context) const
1479 {
1480     return new InvertSSBOInPlaceTestInstance(context, m_numValues, m_localSize, m_workSize,
1481                                              m_computePipelineConstructionType);
1482 }
1483 
// Stores the test parameters; all work happens in iterate().
InvertSSBOInPlaceTestInstance::InvertSSBOInPlaceTestInstance(
    Context &context, const uint32_t numValues, const tcu::IVec3 &localSize, const tcu::IVec3 &workSize,
    const vk::ComputePipelineConstructionType computePipelineConstructionType)
    : TestInstance(context)
    , m_numValues(numValues)
    , m_localSize(localSize)
    , m_workSize(workSize)
    , m_computePipelineConstructionType(computePipelineConstructionType)
{
}
1494 
iterate(void)1495 tcu::TestStatus InvertSSBOInPlaceTestInstance::iterate(void)
1496 {
1497     const DeviceInterface &vk       = m_context.getDeviceInterface();
1498     const VkDevice device           = m_context.getDevice();
1499     const VkQueue queue             = m_context.getUniversalQueue();
1500     const uint32_t queueFamilyIndex = m_context.getUniversalQueueFamilyIndex();
1501     Allocator &allocator            = m_context.getDefaultAllocator();
1502 
1503     // Create an input/output buffer
1504 
1505     const VkDeviceSize bufferSizeBytes = sizeof(uint32_t) * m_numValues;
1506     const BufferWithMemory buffer(vk, device, allocator,
1507                                   makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT),
1508                                   MemoryRequirement::HostVisible);
1509 
1510     // Fill the buffer with data
1511 
1512     typedef std::vector<uint32_t> data_vector_t;
1513     data_vector_t inputData(m_numValues);
1514 
1515     {
1516         de::Random rnd(0x82ce7f);
1517         const Allocation &bufferAllocation = buffer.getAllocation();
1518         uint32_t *bufferPtr                = static_cast<uint32_t *>(bufferAllocation.getHostPtr());
1519         for (uint32_t i = 0; i < m_numValues; ++i)
1520             inputData[i] = *bufferPtr++ = rnd.getUint32();
1521 
1522         flushAlloc(vk, device, bufferAllocation);
1523     }
1524 
1525     // Create descriptor set
1526 
1527     const Unique<VkDescriptorSetLayout> descriptorSetLayout(
1528         DescriptorSetLayoutBuilder()
1529             .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
1530             .build(vk, device));
1531 
1532     const Unique<VkDescriptorPool> descriptorPool(
1533         DescriptorPoolBuilder()
1534             .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
1535             .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
1536 
1537     const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
1538 
1539     const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*buffer, 0ull, bufferSizeBytes);
1540     DescriptorSetUpdateBuilder()
1541         .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u),
1542                      VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
1543         .update(vk, device);
1544 
1545     // Perform the computation
1546 
1547     ComputePipelineWrapper pipeline(vk, device, m_computePipelineConstructionType,
1548                                     m_context.getBinaryCollection().get("comp"));
1549     pipeline.setDescriptorSetLayout(descriptorSetLayout.get());
1550     pipeline.buildPipeline();
1551 
1552     const VkBufferMemoryBarrier hostWriteBarrier =
1553         makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *buffer, 0ull, bufferSizeBytes);
1554 
1555     const VkBufferMemoryBarrier shaderWriteBarrier =
1556         makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer, 0ull, bufferSizeBytes);
1557 
1558     const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
1559     const Unique<VkCommandBuffer> cmdBuffer(
1560         allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
1561 
1562     // Start recording commands
1563 
1564     beginCommandBuffer(vk, *cmdBuffer);
1565 
1566     pipeline.bind(*cmdBuffer);
1567     vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline.getPipelineLayout(), 0u, 1u,
1568                              &descriptorSet.get(), 0u, nullptr);
1569 
1570     vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
1571                           (VkDependencyFlags)0, 0, nullptr, 1, &hostWriteBarrier, 0, nullptr);
1572     vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
1573     vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT,
1574                           (VkDependencyFlags)0, 0, nullptr, 1, &shaderWriteBarrier, 0, nullptr);
1575 
1576     endCommandBuffer(vk, *cmdBuffer);
1577 
1578     // Wait for completion
1579 
1580     submitCommandsAndWait(vk, device, queue, *cmdBuffer);
1581 
1582     // Validate the results
1583 
1584     const Allocation &bufferAllocation = buffer.getAllocation();
1585     invalidateAlloc(vk, device, bufferAllocation);
1586 
1587     const uint32_t *bufferPtr = static_cast<uint32_t *>(bufferAllocation.getHostPtr());
1588 
1589     for (uint32_t ndx = 0; ndx < m_numValues; ++ndx)
1590     {
1591         const uint32_t res = bufferPtr[ndx];
1592         const uint32_t ref = ~inputData[ndx];
1593 
1594         if (res != ref)
1595         {
1596             std::ostringstream msg;
1597             msg << "Comparison failed for InOut.values[" << ndx << "]";
1598             return tcu::TestStatus::fail(msg.str());
1599         }
1600     }
1601     return tcu::TestStatus::pass("Compute succeeded");
1602 }
1603 
1604 class WriteToMultipleSSBOTest : public vkt::TestCase
1605 {
1606 public:
1607     WriteToMultipleSSBOTest(tcu::TestContext &testCtx, const std::string &name, const uint32_t numValues,
1608                             const bool sized, const tcu::IVec3 &localSize, const tcu::IVec3 &workSize,
1609                             const vk::ComputePipelineConstructionType computePipelineConstructionType);
1610 
1611     virtual void checkSupport(Context &context) const;
1612     void initPrograms(SourceCollections &sourceCollections) const;
1613     TestInstance *createInstance(Context &context) const;
1614 
1615 private:
1616     const uint32_t m_numValues;
1617     const bool m_sized;
1618     const tcu::IVec3 m_localSize;
1619     const tcu::IVec3 m_workSize;
1620     vk::ComputePipelineConstructionType m_computePipelineConstructionType;
1621 };
1622 
1623 class WriteToMultipleSSBOTestInstance : public vkt::TestInstance
1624 {
1625 public:
1626     WriteToMultipleSSBOTestInstance(Context &context, const uint32_t numValues, const tcu::IVec3 &localSize,
1627                                     const tcu::IVec3 &workSize,
1628                                     const vk::ComputePipelineConstructionType computePipelineConstructionType);
1629 
1630     tcu::TestStatus iterate(void);
1631 
1632 private:
1633     const uint32_t m_numValues;
1634     const tcu::IVec3 m_localSize;
1635     const tcu::IVec3 m_workSize;
1636     vk::ComputePipelineConstructionType m_computePipelineConstructionType;
1637 };
1638 
// Stores the test parameters. The value count must divide evenly among all
// invocations (workgroups x local size) because each invocation writes an
// equal slice of each output buffer.
WriteToMultipleSSBOTest::WriteToMultipleSSBOTest(
    tcu::TestContext &testCtx, const std::string &name, const uint32_t numValues, const bool sized,
    const tcu::IVec3 &localSize, const tcu::IVec3 &workSize,
    const vk::ComputePipelineConstructionType computePipelineConstructionType)
    : TestCase(testCtx, name)
    , m_numValues(numValues)
    , m_sized(sized)
    , m_localSize(localSize)
    , m_workSize(workSize)
    , m_computePipelineConstructionType(computePipelineConstructionType)
{
    DE_ASSERT(m_numValues % (multiplyComponents(m_workSize) * multiplyComponents(m_localSize)) == 0);
}
1652 
// Skips the test when the requested pipeline construction path
// (e.g. shader objects) is not available on this implementation.
void WriteToMultipleSSBOTest::checkSupport(Context &context) const
{
    checkShaderObjectRequirements(context.getInstanceInterface(), context.getPhysicalDevice(),
                                  m_computePipelineConstructionType);
}
1658 
initPrograms(SourceCollections & sourceCollections) const1659 void WriteToMultipleSSBOTest::initPrograms(SourceCollections &sourceCollections) const
1660 {
1661     std::ostringstream src;
1662     src << "#version 310 es\n"
1663         << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y()
1664         << ", local_size_z = " << m_localSize.z() << ") in;\n"
1665         << "layout(binding = 0) writeonly buffer Out0 {\n"
1666         << "    uint values[" << (m_sized ? de::toString(m_numValues) : "") << "];\n"
1667         << "} sb_out0;\n"
1668         << "layout(binding = 1) writeonly buffer Out1 {\n"
1669         << "    uint values[" << (m_sized ? de::toString(m_numValues) : "") << "];\n"
1670         << "} sb_out1;\n"
1671         << "void main (void) {\n"
1672         << "    uvec3 size      = gl_NumWorkGroups * gl_WorkGroupSize;\n"
1673         << "    uint groupNdx   = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + "
1674            "gl_GlobalInvocationID.x;\n"
1675         << "\n"
1676         << "    {\n"
1677         << "        uint numValuesPerInv = uint(sb_out0.values.length()) / (size.x*size.y*size.z);\n"
1678         << "        uint offset          = numValuesPerInv*groupNdx;\n"
1679         << "\n"
1680         << "        for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
1681         << "            sb_out0.values[offset + ndx] = offset + ndx;\n"
1682         << "    }\n"
1683         << "    {\n"
1684         << "        uint numValuesPerInv = uint(sb_out1.values.length()) / (size.x*size.y*size.z);\n"
1685         << "        uint offset          = numValuesPerInv*groupNdx;\n"
1686         << "\n"
1687         << "        for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
1688         << "            sb_out1.values[offset + ndx] = uint(sb_out1.values.length()) - offset - ndx;\n"
1689         << "    }\n"
1690         << "}\n";
1691 
1692     sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
1693 }
1694 
createInstance(Context & context) const1695 TestInstance *WriteToMultipleSSBOTest::createInstance(Context &context) const
1696 {
1697     return new WriteToMultipleSSBOTestInstance(context, m_numValues, m_localSize, m_workSize,
1698                                                m_computePipelineConstructionType);
1699 }
1700 
// Stores the test parameters; all work happens in iterate().
WriteToMultipleSSBOTestInstance::WriteToMultipleSSBOTestInstance(
    Context &context, const uint32_t numValues, const tcu::IVec3 &localSize, const tcu::IVec3 &workSize,
    const vk::ComputePipelineConstructionType computePipelineConstructionType)
    : TestInstance(context)
    , m_numValues(numValues)
    , m_localSize(localSize)
    , m_workSize(workSize)
    , m_computePipelineConstructionType(computePipelineConstructionType)
{
}
1711 
// Executes the two-SSBO write pass: dispatches the "comp" shader which writes
// ascending indices into buffer 0 and descending values (m_numValues - index)
// into buffer 1, then verifies both buffers on the host.
tcu::TestStatus WriteToMultipleSSBOTestInstance::iterate(void)
{
    const DeviceInterface &vk       = m_context.getDeviceInterface();
    const VkDevice device           = m_context.getDevice();
    const VkQueue queue             = m_context.getUniversalQueue();
    const uint32_t queueFamilyIndex = m_context.getUniversalQueueFamilyIndex();
    Allocator &allocator            = m_context.getDefaultAllocator();

    // Create two output buffers

    const VkDeviceSize bufferSizeBytes = sizeof(uint32_t) * m_numValues;
    const BufferWithMemory buffer0(vk, device, allocator,
                                   makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT),
                                   MemoryRequirement::HostVisible);
    const BufferWithMemory buffer1(vk, device, allocator,
                                   makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT),
                                   MemoryRequirement::HostVisible);

    // Create descriptor set: storage buffers at bindings 0 and 1

    const Unique<VkDescriptorSetLayout> descriptorSetLayout(
        DescriptorSetLayoutBuilder()
            .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
            .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
            .build(vk, device));

    const Unique<VkDescriptorPool> descriptorPool(
        DescriptorPoolBuilder()
            .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 2u)
            .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

    const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

    const VkDescriptorBufferInfo buffer0DescriptorInfo = makeDescriptorBufferInfo(*buffer0, 0ull, bufferSizeBytes);
    const VkDescriptorBufferInfo buffer1DescriptorInfo = makeDescriptorBufferInfo(*buffer1, 0ull, bufferSizeBytes);
    DescriptorSetUpdateBuilder()
        .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u),
                     VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &buffer0DescriptorInfo)
        .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u),
                     VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &buffer1DescriptorInfo)
        .update(vk, device);

    // Perform the computation

    ComputePipelineWrapper pipeline(vk, device, m_computePipelineConstructionType,
                                    m_context.getBinaryCollection().get("comp"));
    pipeline.setDescriptorSetLayout(descriptorSetLayout.get());
    pipeline.buildPipeline();

    // Make the shader's writes to both buffers visible to the host read-back.
    // No host-write barrier is needed: the shaders only write, never read.
    const VkBufferMemoryBarrier shaderWriteBarriers[] = {
        makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer0, 0ull, bufferSizeBytes),
        makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer1, 0ull, bufferSizeBytes)};

    const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
    const Unique<VkCommandBuffer> cmdBuffer(
        allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

    // Start recording commands

    beginCommandBuffer(vk, *cmdBuffer);

    pipeline.bind(*cmdBuffer);
    vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline.getPipelineLayout(), 0u, 1u,
                             &descriptorSet.get(), 0u, nullptr);

    vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
    vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT,
                          (VkDependencyFlags)0, 0, nullptr, DE_LENGTH_OF_ARRAY(shaderWriteBarriers),
                          shaderWriteBarriers, 0, nullptr);

    endCommandBuffer(vk, *cmdBuffer);

    // Wait for completion

    submitCommandsAndWait(vk, device, queue, *cmdBuffer);

    // Validate the results: buffer 0 holds ascending indices
    {
        const Allocation &buffer0Allocation = buffer0.getAllocation();
        invalidateAlloc(vk, device, buffer0Allocation);
        const uint32_t *buffer0Ptr = static_cast<uint32_t *>(buffer0Allocation.getHostPtr());

        for (uint32_t ndx = 0; ndx < m_numValues; ++ndx)
        {
            const uint32_t res = buffer0Ptr[ndx];
            const uint32_t ref = ndx;

            if (res != ref)
            {
                std::ostringstream msg;
                msg << "Comparison failed for Out0.values[" << ndx << "] res=" << res << " ref=" << ref;
                return tcu::TestStatus::fail(msg.str());
            }
        }
    }
    // Buffer 1 holds descending values (m_numValues - index)
    {
        const Allocation &buffer1Allocation = buffer1.getAllocation();
        invalidateAlloc(vk, device, buffer1Allocation);
        const uint32_t *buffer1Ptr = static_cast<uint32_t *>(buffer1Allocation.getHostPtr());

        for (uint32_t ndx = 0; ndx < m_numValues; ++ndx)
        {
            const uint32_t res = buffer1Ptr[ndx];
            const uint32_t ref = m_numValues - ndx;

            if (res != ref)
            {
                std::ostringstream msg;
                msg << "Comparison failed for Out1.values[" << ndx << "] res=" << res << " ref=" << ref;
                return tcu::TestStatus::fail(msg.str());
            }
        }
    }
    return tcu::TestStatus::pass("Compute succeeded");
}
1827 
1828 class SSBOBarrierTest : public vkt::TestCase
1829 {
1830 public:
1831     SSBOBarrierTest(tcu::TestContext &testCtx, const std::string &name, const tcu::IVec3 &workSize,
1832                     const vk::ComputePipelineConstructionType computePipelineConstructionType);
1833 
1834     virtual void checkSupport(Context &context) const;
1835     void initPrograms(SourceCollections &sourceCollections) const;
1836     TestInstance *createInstance(Context &context) const;
1837 
1838 private:
1839     const tcu::IVec3 m_workSize;
1840     vk::ComputePipelineConstructionType m_computePipelineConstructionType;
1841 };
1842 
1843 class SSBOBarrierTestInstance : public vkt::TestInstance
1844 {
1845 public:
1846     SSBOBarrierTestInstance(Context &context, const tcu::IVec3 &workSize,
1847                             const vk::ComputePipelineConstructionType computePipelineConstructionType);
1848 
1849     tcu::TestStatus iterate(void);
1850 
1851 private:
1852     const tcu::IVec3 m_workSize;
1853     vk::ComputePipelineConstructionType m_computePipelineConstructionType;
1854 };
1855 
// Stores the test parameters; all work happens in the instance's iterate().
SSBOBarrierTest::SSBOBarrierTest(tcu::TestContext &testCtx, const std::string &name, const tcu::IVec3 &workSize,
                                 const vk::ComputePipelineConstructionType computePipelineConstructionType)
    : TestCase(testCtx, name)
    , m_workSize(workSize)
    , m_computePipelineConstructionType(computePipelineConstructionType)
{
}
1863 
// Skips the test when the requested pipeline construction path
// (e.g. shader objects) is not available on this implementation.
void SSBOBarrierTest::checkSupport(Context &context) const
{
    checkShaderObjectRequirements(context.getInstanceInterface(), context.getPhysicalDevice(),
                                  m_computePipelineConstructionType);
}
1869 
initPrograms(SourceCollections & sourceCollections) const1870 void SSBOBarrierTest::initPrograms(SourceCollections &sourceCollections) const
1871 {
1872     sourceCollections.glslSources.add("comp0")
1873         << glu::ComputeSource("#version 310 es\n"
1874                               "layout (local_size_x = 1) in;\n"
1875                               "layout(binding = 2) readonly uniform Constants {\n"
1876                               "    uint u_baseVal;\n"
1877                               "};\n"
1878                               "layout(binding = 1) writeonly buffer Output {\n"
1879                               "    uint values[];\n"
1880                               "};\n"
1881                               "void main (void) {\n"
1882                               "    uint offset = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + "
1883                               "gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
1884                               "    values[offset] = u_baseVal + offset;\n"
1885                               "}\n");
1886 
1887     sourceCollections.glslSources.add("comp1")
1888         << glu::ComputeSource("#version 310 es\n"
1889                               "layout (local_size_x = 1) in;\n"
1890                               "layout(binding = 1) readonly buffer Input {\n"
1891                               "    uint values[];\n"
1892                               "};\n"
1893                               "layout(binding = 0) coherent buffer Output {\n"
1894                               "    uint sum;\n"
1895                               "};\n"
1896                               "void main (void) {\n"
1897                               "    uint offset = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + "
1898                               "gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
1899                               "    uint value  = values[offset];\n"
1900                               "    atomicAdd(sum, value);\n"
1901                               "}\n");
1902 }
1903 
createInstance(Context & context) const1904 TestInstance *SSBOBarrierTest::createInstance(Context &context) const
1905 {
1906     return new SSBOBarrierTestInstance(context, m_workSize, m_computePipelineConstructionType);
1907 }
1908 
// Constructor. Only stores the parameters; all Vulkan work happens in iterate().
SSBOBarrierTestInstance::SSBOBarrierTestInstance(
    Context &context, const tcu::IVec3 &workSize,
    const vk::ComputePipelineConstructionType computePipelineConstructionType)
    : TestInstance(context)
    , m_workSize(workSize)
    , m_computePipelineConstructionType(computePipelineConstructionType)
{
}
1917 
// Runs two dependent compute dispatches in one command buffer: comp0 fills a
// scratch SSBO with (u_baseVal + linear workgroup index), a shader-write ->
// shader-read buffer barrier makes those writes visible, then comp1 atomically
// sums the values into a single host-visible counter, which is validated
// against a CPU-computed reference.
tcu::TestStatus SSBOBarrierTestInstance::iterate(void)
{
    const DeviceInterface &vk       = m_context.getDeviceInterface();
    const VkDevice device           = m_context.getDevice();
    const VkQueue queue             = m_context.getUniversalQueue();
    const uint32_t queueFamilyIndex = m_context.getUniversalQueueFamilyIndex();
    Allocator &allocator            = m_context.getDefaultAllocator();

    // Create a work buffer used by both shaders
    // (one uint32 per workgroup; device-local is fine, the host never maps it).

    const int workGroupCount               = multiplyComponents(m_workSize);
    const VkDeviceSize workBufferSizeBytes = sizeof(uint32_t) * workGroupCount;
    const BufferWithMemory workBuffer(vk, device, allocator,
                                      makeBufferCreateInfo(workBufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT),
                                      MemoryRequirement::Any);

    // Create an output buffer (single uint32 atomic counter, read back by the host)

    const VkDeviceSize outputBufferSizeBytes = sizeof(uint32_t);
    const BufferWithMemory outputBuffer(vk, device, allocator,
                                        makeBufferCreateInfo(outputBufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT),
                                        MemoryRequirement::HostVisible);

    // Initialize atomic counter value to zero
    {
        const Allocation &outputBufferAllocation = outputBuffer.getAllocation();
        uint32_t *outputBufferPtr                = static_cast<uint32_t *>(outputBufferAllocation.getHostPtr());
        *outputBufferPtr                         = 0;
        flushAlloc(vk, device, outputBufferAllocation);
    }

    // Create a uniform buffer (to pass uniform constants)

    const VkDeviceSize uniformBufferSizeBytes = sizeof(uint32_t);
    const BufferWithMemory uniformBuffer(
        vk, device, allocator, makeBufferCreateInfo(uniformBufferSizeBytes, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT),
        MemoryRequirement::HostVisible);

    // Set the constants in the uniform buffer

    const uint32_t baseValue = 127;
    {
        const Allocation &uniformBufferAllocation = uniformBuffer.getAllocation();
        uint32_t *uniformBufferPtr                = static_cast<uint32_t *>(uniformBufferAllocation.getHostPtr());
        uniformBufferPtr[0]                       = baseValue;

        flushAlloc(vk, device, uniformBufferAllocation);
    }

    // Create descriptor set
    // Bindings match the shaders: 0 = output sum SSBO, 1 = work SSBO, 2 = constants UBO.

    const Unique<VkDescriptorSetLayout> descriptorSetLayout(
        DescriptorSetLayoutBuilder()
            .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
            .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
            .addSingleBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
            .build(vk, device));

    const Unique<VkDescriptorPool> descriptorPool(
        DescriptorPoolBuilder()
            .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 2u)
            .addType(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)
            .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

    const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

    const VkDescriptorBufferInfo workBufferDescriptorInfo =
        makeDescriptorBufferInfo(*workBuffer, 0ull, workBufferSizeBytes);
    const VkDescriptorBufferInfo outputBufferDescriptorInfo =
        makeDescriptorBufferInfo(*outputBuffer, 0ull, outputBufferSizeBytes);
    const VkDescriptorBufferInfo uniformBufferDescriptorInfo =
        makeDescriptorBufferInfo(*uniformBuffer, 0ull, uniformBufferSizeBytes);
    DescriptorSetUpdateBuilder()
        .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u),
                     VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &outputBufferDescriptorInfo)
        .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u),
                     VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &workBufferDescriptorInfo)
        .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(2u),
                     VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, &uniformBufferDescriptorInfo)
        .update(vk, device);

    // Perform the computation
    // Both pipelines share the same descriptor set layout, so the set bound for
    // pipeline0 remains valid after switching to pipeline1.

    ComputePipelineWrapper pipeline0(vk, device, m_computePipelineConstructionType,
                                     m_context.getBinaryCollection().get("comp0"));
    pipeline0.setDescriptorSetLayout(descriptorSetLayout.get());
    pipeline0.buildPipeline();

    ComputePipelineWrapper pipeline1(vk, device, m_computePipelineConstructionType,
                                     m_context.getBinaryCollection().get("comp1"));
    pipeline1.setDescriptorSetLayout(descriptorSetLayout.get());
    pipeline1.buildPipeline();

    // Host write of constants must be visible to comp0's uniform reads.
    const VkBufferMemoryBarrier writeUniformConstantsBarrier = makeBufferMemoryBarrier(
        VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_UNIFORM_READ_BIT, *uniformBuffer, 0ull, uniformBufferSizeBytes);

    // comp0's SSBO writes must be visible to comp1's SSBO reads — this is the
    // barrier the test actually exercises.
    const VkBufferMemoryBarrier betweenShadersBarrier = makeBufferMemoryBarrier(
        VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *workBuffer, 0ull, workBufferSizeBytes);

    // comp1's atomic writes must be visible to the host readback.
    const VkBufferMemoryBarrier afterComputeBarrier = makeBufferMemoryBarrier(
        VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *outputBuffer, 0ull, outputBufferSizeBytes);

    const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
    const Unique<VkCommandBuffer> cmdBuffer(
        allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

    // Start recording commands

    beginCommandBuffer(vk, *cmdBuffer);

    pipeline0.bind(*cmdBuffer);
    vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline0.getPipelineLayout(), 0u, 1u,
                             &descriptorSet.get(), 0u, nullptr);

    vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                          (VkDependencyFlags)0, 0, nullptr, 1, &writeUniformConstantsBarrier, 0, nullptr);

    vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
    vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                          (VkDependencyFlags)0, 0, nullptr, 1, &betweenShadersBarrier, 0, nullptr);

    // Switch to the second shader program
    pipeline1.bind(*cmdBuffer);

    vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
    vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT,
                          (VkDependencyFlags)0, 0, nullptr, 1, &afterComputeBarrier, 0, nullptr);

    endCommandBuffer(vk, *cmdBuffer);

    // Wait for completion

    submitCommandsAndWait(vk, device, queue, *cmdBuffer);

    // Validate the results
    // Reference: sum over all workgroups of (baseValue + linear index).

    const Allocation &outputBufferAllocation = outputBuffer.getAllocation();
    invalidateAlloc(vk, device, outputBufferAllocation);

    const uint32_t *bufferPtr = static_cast<uint32_t *>(outputBufferAllocation.getHostPtr());
    const uint32_t res        = *bufferPtr;
    uint32_t ref              = 0;

    for (int ndx = 0; ndx < workGroupCount; ++ndx)
        ref += baseValue + ndx;

    if (res != ref)
    {
        std::ostringstream msg;
        msg << "ERROR: comparison failed, expected " << ref << ", got " << res;
        return tcu::TestStatus::fail(msg.str());
    }
    return tcu::TestStatus::pass("Compute succeeded");
}
2072 
// Tests imageAtomicAdd on an r32ui storage image: each workgroup of
// m_localSize invocations accumulates its input values into one texel
// (see the shader in initPrograms and validation in the instance's iterate).
class ImageAtomicOpTest : public vkt::TestCase
{
public:
    ImageAtomicOpTest(tcu::TestContext &testCtx, const std::string &name, const uint32_t localSize,
                      const tcu::IVec2 &imageSize,
                      const vk::ComputePipelineConstructionType computePipelineConstructionType);

    virtual void checkSupport(Context &context) const;
    void initPrograms(SourceCollections &sourceCollections) const;
    TestInstance *createInstance(Context &context) const;

private:
    const uint32_t m_localSize;   // invocations per workgroup (local_size_x)
    const tcu::IVec2 m_imageSize; // image extent == dispatch size in workgroups
    vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
2089 
// Per-run instance for ImageAtomicOpTest; all Vulkan work happens in iterate().
class ImageAtomicOpTestInstance : public vkt::TestInstance
{
public:
    ImageAtomicOpTestInstance(Context &context, const uint32_t localSize, const tcu::IVec2 &imageSize,
                              const vk::ComputePipelineConstructionType computePipelineConstructionType);

    tcu::TestStatus iterate(void);

private:
    const uint32_t m_localSize;   // invocations per workgroup (local_size_x)
    const tcu::IVec2 m_imageSize; // image extent == dispatch size in workgroups
    vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
2103 
// Constructor. Only stores parameters; shader generation happens in initPrograms().
ImageAtomicOpTest::ImageAtomicOpTest(tcu::TestContext &testCtx, const std::string &name, const uint32_t localSize,
                                     const tcu::IVec2 &imageSize,
                                     const vk::ComputePipelineConstructionType computePipelineConstructionType)
    : TestCase(testCtx, name)
    , m_localSize(localSize)
    , m_imageSize(imageSize)
    , m_computePipelineConstructionType(computePipelineConstructionType)
{
}
2113 
checkSupport(Context & context) const2114 void ImageAtomicOpTest::checkSupport(Context &context) const
2115 {
2116     checkShaderObjectRequirements(context.getInstanceInterface(), context.getPhysicalDevice(),
2117                                   m_computePipelineConstructionType);
2118 }
2119 
initPrograms(SourceCollections & sourceCollections) const2120 void ImageAtomicOpTest::initPrograms(SourceCollections &sourceCollections) const
2121 {
2122     std::ostringstream src;
2123     src << "#version 310 es\n"
2124         << "#extension GL_OES_shader_image_atomic : require\n"
2125         << "layout (local_size_x = " << m_localSize << ") in;\n"
2126         << "layout(binding = 1, r32ui) coherent uniform highp uimage2D u_dstImg;\n"
2127         << "layout(binding = 0) readonly buffer Input {\n"
2128         << "    uint values[" << (multiplyComponents(m_imageSize) * m_localSize) << "];\n"
2129         << "} sb_in;\n\n"
2130         << "void main (void) {\n"
2131         << "    uint stride = gl_NumWorkGroups.x*gl_WorkGroupSize.x;\n"
2132         << "    uint value  = sb_in.values[gl_GlobalInvocationID.y*stride + gl_GlobalInvocationID.x];\n"
2133         << "\n"
2134         << "    if (gl_LocalInvocationIndex == 0u)\n"
2135         << "        imageStore(u_dstImg, ivec2(gl_WorkGroupID.xy), uvec4(0));\n"
2136         << "    memoryBarrierImage();\n"
2137         << "    barrier();\n"
2138         << "    imageAtomicAdd(u_dstImg, ivec2(gl_WorkGroupID.xy), value);\n"
2139         << "}\n";
2140 
2141     sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
2142 }
2143 
createInstance(Context & context) const2144 TestInstance *ImageAtomicOpTest::createInstance(Context &context) const
2145 {
2146     return new ImageAtomicOpTestInstance(context, m_localSize, m_imageSize, m_computePipelineConstructionType);
2147 }
2148 
// Constructor. Only stores the parameters; all Vulkan work happens in iterate().
ImageAtomicOpTestInstance::ImageAtomicOpTestInstance(
    Context &context, const uint32_t localSize, const tcu::IVec2 &imageSize,
    const vk::ComputePipelineConstructionType computePipelineConstructionType)
    : TestInstance(context)
    , m_localSize(localSize)
    , m_imageSize(imageSize)
    , m_computePipelineConstructionType(computePipelineConstructionType)
{
}
2158 
// Dispatches one workgroup per texel of an r32ui storage image; the shader
// atomically accumulates m_localSize random input values into each texel.
// The image is then copied to a host-visible buffer and every texel is
// validated against a CPU-computed sum of the corresponding input slice.
tcu::TestStatus ImageAtomicOpTestInstance::iterate(void)
{
    const DeviceInterface &vk       = m_context.getDeviceInterface();
    const VkDevice device           = m_context.getDevice();
    const VkQueue queue             = m_context.getUniversalQueue();
    const uint32_t queueFamilyIndex = m_context.getUniversalQueueFamilyIndex();
    Allocator &allocator            = m_context.getDefaultAllocator();

    // Create an image (storage for the shader, transfer-src for the readback copy)

    const VkImageCreateInfo imageParams =
        make2DImageCreateInfo(m_imageSize, VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_STORAGE_BIT);
    const ImageWithMemory image(vk, device, allocator, imageParams, MemoryRequirement::Any);

    const VkImageSubresourceRange subresourceRange =
        makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
    const Unique<VkImageView> imageView(
        makeImageView(vk, device, *image, VK_IMAGE_VIEW_TYPE_2D, VK_FORMAT_R32_UINT, subresourceRange));

    // Input buffer (m_localSize values per texel)

    const uint32_t numInputValues           = multiplyComponents(m_imageSize) * m_localSize;
    const VkDeviceSize inputBufferSizeBytes = sizeof(uint32_t) * numInputValues;

    const BufferWithMemory inputBuffer(vk, device, allocator,
                                       makeBufferCreateInfo(inputBufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT),
                                       MemoryRequirement::HostVisible);

    // Populate the input buffer with test data
    // (fixed seed so the reference computation below sees the same values)
    {
        de::Random rnd(0x77238ac2);
        const Allocation &inputBufferAllocation = inputBuffer.getAllocation();
        uint32_t *bufferPtr                     = static_cast<uint32_t *>(inputBufferAllocation.getHostPtr());
        for (uint32_t i = 0; i < numInputValues; ++i)
            *bufferPtr++ = rnd.getUint32();

        flushAlloc(vk, device, inputBufferAllocation);
    }

    // Create a buffer to store shader output (copied from image data)

    const uint32_t imageArea                 = multiplyComponents(m_imageSize);
    const VkDeviceSize outputBufferSizeBytes = sizeof(uint32_t) * imageArea;
    const BufferWithMemory outputBuffer(vk, device, allocator,
                                        makeBufferCreateInfo(outputBufferSizeBytes, VK_BUFFER_USAGE_TRANSFER_DST_BIT),
                                        MemoryRequirement::HostVisible);

    // Create descriptor set
    // Bindings match the shader: 0 = input SSBO, 1 = destination storage image.

    const Unique<VkDescriptorSetLayout> descriptorSetLayout(
        DescriptorSetLayoutBuilder()
            .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
            .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_SHADER_STAGE_COMPUTE_BIT)
            .build(vk, device));

    const Unique<VkDescriptorPool> descriptorPool(
        DescriptorPoolBuilder()
            .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
            .addType(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE)
            .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

    const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

    // Set the bindings

    const VkDescriptorImageInfo imageDescriptorInfo =
        makeDescriptorImageInfo(VK_NULL_HANDLE, *imageView, VK_IMAGE_LAYOUT_GENERAL);
    const VkDescriptorBufferInfo bufferDescriptorInfo =
        makeDescriptorBufferInfo(*inputBuffer, 0ull, inputBufferSizeBytes);

    DescriptorSetUpdateBuilder()
        .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u),
                     VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
        .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u),
                     VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &imageDescriptorInfo)
        .update(vk, device);

    // Perform the computation
    {
        ComputePipelineWrapper pipeline(vk, device, m_computePipelineConstructionType,
                                        m_context.getBinaryCollection().get("comp"));
        pipeline.setDescriptorSetLayout(descriptorSetLayout.get());
        pipeline.buildPipeline();

        // Host writes to the input buffer must be visible to the shader reads.
        const VkBufferMemoryBarrier inputBufferPostHostWriteBarrier = makeBufferMemoryBarrier(
            VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *inputBuffer, 0ull, inputBufferSizeBytes);

        // UNDEFINED -> GENERAL transition so the shader can store/atomic into the image.
        const VkImageMemoryBarrier imageLayoutBarrier =
            makeImageMemoryBarrier((VkAccessFlags)0, VK_ACCESS_SHADER_WRITE_BIT, VK_IMAGE_LAYOUT_UNDEFINED,
                                   VK_IMAGE_LAYOUT_GENERAL, *image, subresourceRange);

        // Prepare the command buffer

        const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
        const Unique<VkCommandBuffer> cmdBuffer(
            allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

        // Start recording commands

        beginCommandBuffer(vk, *cmdBuffer);

        pipeline.bind(*cmdBuffer);
        vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline.getPipelineLayout(), 0u, 1u,
                                 &descriptorSet.get(), 0u, nullptr);

        vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                              (VkDependencyFlags)0, 0, nullptr, 1, &inputBufferPostHostWriteBarrier, 1,
                              &imageLayoutBarrier);
        vk.cmdDispatch(*cmdBuffer, m_imageSize.x(), m_imageSize.y(), 1u);

        // copyImageToBuffer also inserts the shader-write -> transfer and
        // transfer -> host barriers needed for the readback below.
        copyImageToBuffer(vk, *cmdBuffer, *image, *outputBuffer, m_imageSize, VK_ACCESS_SHADER_WRITE_BIT,
                          VK_IMAGE_LAYOUT_GENERAL);

        endCommandBuffer(vk, *cmdBuffer);

        // Wait for completion

        submitCommandsAndWait(vk, device, queue, *cmdBuffer);
    }

    // Validate the results
    // Reference per texel: sum of its m_localSize input values (uint32 wraparound
    // matches the shader's atomic adds).

    const Allocation &outputBufferAllocation = outputBuffer.getAllocation();
    invalidateAlloc(vk, device, outputBufferAllocation);

    const uint32_t *bufferPtr    = static_cast<uint32_t *>(outputBufferAllocation.getHostPtr());
    const uint32_t *refBufferPtr = static_cast<uint32_t *>(inputBuffer.getAllocation().getHostPtr());

    for (uint32_t pixelNdx = 0; pixelNdx < imageArea; ++pixelNdx)
    {
        const uint32_t res = bufferPtr[pixelNdx];
        uint32_t ref       = 0;

        for (uint32_t offs = 0; offs < m_localSize; ++offs)
            ref += refBufferPtr[pixelNdx * m_localSize + offs];

        if (res != ref)
        {
            std::ostringstream msg;
            msg << "Comparison failed for pixel " << pixelNdx;
            return tcu::TestStatus::fail(msg.str());
        }
    }
    return tcu::TestStatus::pass("Compute succeeded");
}
2304 
// Tests image memory barriers between two compute dispatches: the first
// shader writes per-workgroup values into an r32ui image, the second reads
// them back and atomically sums them into an SSBO (see initPrograms).
class ImageBarrierTest : public vkt::TestCase
{
public:
    ImageBarrierTest(tcu::TestContext &testCtx, const std::string &name, const tcu::IVec2 &imageSize,
                     const vk::ComputePipelineConstructionType computePipelineConstructionType);

    virtual void checkSupport(Context &context) const;
    void initPrograms(SourceCollections &sourceCollections) const;
    TestInstance *createInstance(Context &context) const;

private:
    const tcu::IVec2 m_imageSize; // image extent == dispatch size in workgroups
    vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
2319 
// Per-run instance for ImageBarrierTest; all Vulkan work happens in iterate().
class ImageBarrierTestInstance : public vkt::TestInstance
{
public:
    ImageBarrierTestInstance(Context &context, const tcu::IVec2 &imageSize,
                             const vk::ComputePipelineConstructionType computePipelineConstructionType);

    tcu::TestStatus iterate(void);

private:
    const tcu::IVec2 m_imageSize; // image extent == dispatch size in workgroups
    vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
2332 
// Constructor. Only stores parameters; shader generation happens in initPrograms().
ImageBarrierTest::ImageBarrierTest(tcu::TestContext &testCtx, const std::string &name, const tcu::IVec2 &imageSize,
                                   const vk::ComputePipelineConstructionType computePipelineConstructionType)
    : TestCase(testCtx, name)
    , m_imageSize(imageSize)
    , m_computePipelineConstructionType(computePipelineConstructionType)
{
}
2340 
checkSupport(Context & context) const2341 void ImageBarrierTest::checkSupport(Context &context) const
2342 {
2343     checkShaderObjectRequirements(context.getInstanceInterface(), context.getPhysicalDevice(),
2344                                   m_computePipelineConstructionType);
2345 }
2346 
initPrograms(SourceCollections & sourceCollections) const2347 void ImageBarrierTest::initPrograms(SourceCollections &sourceCollections) const
2348 {
2349     sourceCollections.glslSources.add("comp0")
2350         << glu::ComputeSource("#version 310 es\n"
2351                               "layout (local_size_x = 1) in;\n"
2352                               "layout(binding = 2) readonly uniform Constants {\n"
2353                               "    uint u_baseVal;\n"
2354                               "};\n"
2355                               "layout(binding = 1, r32ui) writeonly uniform highp uimage2D u_img;\n"
2356                               "void main (void) {\n"
2357                               "    uint offset = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + "
2358                               "gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
2359                               "    imageStore(u_img, ivec2(gl_WorkGroupID.xy), uvec4(offset + u_baseVal, 0, 0, 0));\n"
2360                               "}\n");
2361 
2362     sourceCollections.glslSources.add("comp1")
2363         << glu::ComputeSource("#version 310 es\n"
2364                               "layout (local_size_x = 1) in;\n"
2365                               "layout(binding = 1, r32ui) readonly uniform highp uimage2D u_img;\n"
2366                               "layout(binding = 0) coherent buffer Output {\n"
2367                               "    uint sum;\n"
2368                               "};\n"
2369                               "void main (void) {\n"
2370                               "    uint value = imageLoad(u_img, ivec2(gl_WorkGroupID.xy)).x;\n"
2371                               "    atomicAdd(sum, value);\n"
2372                               "}\n");
2373 }
2374 
createInstance(Context & context) const2375 TestInstance *ImageBarrierTest::createInstance(Context &context) const
2376 {
2377     return new ImageBarrierTestInstance(context, m_imageSize, m_computePipelineConstructionType);
2378 }
2379 
// Constructor. Only stores the parameters; all Vulkan work happens in iterate().
ImageBarrierTestInstance::ImageBarrierTestInstance(
    Context &context, const tcu::IVec2 &imageSize,
    const vk::ComputePipelineConstructionType computePipelineConstructionType)
    : TestInstance(context)
    , m_imageSize(imageSize)
    , m_computePipelineConstructionType(computePipelineConstructionType)
{
}
2388 
iterate(void)2389 tcu::TestStatus ImageBarrierTestInstance::iterate(void)
2390 {
2391     const DeviceInterface &vk       = m_context.getDeviceInterface();
2392     const VkDevice device           = m_context.getDevice();
2393     const VkQueue queue             = m_context.getUniversalQueue();
2394     const uint32_t queueFamilyIndex = m_context.getUniversalQueueFamilyIndex();
2395     Allocator &allocator            = m_context.getDefaultAllocator();
2396 
2397     // Create an image used by both shaders
2398 
2399     const VkImageCreateInfo imageParams = make2DImageCreateInfo(m_imageSize, VK_IMAGE_USAGE_STORAGE_BIT);
2400     const ImageWithMemory image(vk, device, allocator, imageParams, MemoryRequirement::Any);
2401 
2402     const VkImageSubresourceRange subresourceRange =
2403         makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
2404     const Unique<VkImageView> imageView(
2405         makeImageView(vk, device, *image, VK_IMAGE_VIEW_TYPE_2D, VK_FORMAT_R32_UINT, subresourceRange));
2406 
2407     // Create an output buffer
2408 
2409     const VkDeviceSize outputBufferSizeBytes = sizeof(uint32_t);
2410     const BufferWithMemory outputBuffer(vk, device, allocator,
2411                                         makeBufferCreateInfo(outputBufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT),
2412                                         MemoryRequirement::HostVisible);
2413 
2414     // Initialize atomic counter value to zero
2415     {
2416         const Allocation &outputBufferAllocation = outputBuffer.getAllocation();
2417         uint32_t *outputBufferPtr                = static_cast<uint32_t *>(outputBufferAllocation.getHostPtr());
2418         *outputBufferPtr                         = 0;
2419         flushAlloc(vk, device, outputBufferAllocation);
2420     }
2421 
2422     // Create a uniform buffer (to pass uniform constants)
2423 
2424     const VkDeviceSize uniformBufferSizeBytes = sizeof(uint32_t);
2425     const BufferWithMemory uniformBuffer(
2426         vk, device, allocator, makeBufferCreateInfo(uniformBufferSizeBytes, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT),
2427         MemoryRequirement::HostVisible);
2428 
2429     // Set the constants in the uniform buffer
2430 
2431     const uint32_t baseValue = 127;
2432     {
2433         const Allocation &uniformBufferAllocation = uniformBuffer.getAllocation();
2434         uint32_t *uniformBufferPtr                = static_cast<uint32_t *>(uniformBufferAllocation.getHostPtr());
2435         uniformBufferPtr[0]                       = baseValue;
2436 
2437         flushAlloc(vk, device, uniformBufferAllocation);
2438     }
2439 
2440     // Create descriptor set
2441 
2442     const Unique<VkDescriptorSetLayout> descriptorSetLayout(
2443         DescriptorSetLayoutBuilder()
2444             .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
2445             .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_SHADER_STAGE_COMPUTE_BIT)
2446             .addSingleBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
2447             .build(vk, device));
2448 
2449     const Unique<VkDescriptorPool> descriptorPool(
2450         DescriptorPoolBuilder()
2451             .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
2452             .addType(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE)
2453             .addType(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)
2454             .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
2455 
2456     const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
2457 
2458     const VkDescriptorImageInfo imageDescriptorInfo =
2459         makeDescriptorImageInfo(VK_NULL_HANDLE, *imageView, VK_IMAGE_LAYOUT_GENERAL);
2460     const VkDescriptorBufferInfo outputBufferDescriptorInfo =
2461         makeDescriptorBufferInfo(*outputBuffer, 0ull, outputBufferSizeBytes);
2462     const VkDescriptorBufferInfo uniformBufferDescriptorInfo =
2463         makeDescriptorBufferInfo(*uniformBuffer, 0ull, uniformBufferSizeBytes);
2464     DescriptorSetUpdateBuilder()
2465         .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u),
2466                      VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &outputBufferDescriptorInfo)
2467         .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u),
2468                      VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &imageDescriptorInfo)
2469         .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(2u),
2470                      VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, &uniformBufferDescriptorInfo)
2471         .update(vk, device);
2472 
2473     // Perform the computation
2474 
2475     ComputePipelineWrapper pipeline0(vk, device, m_computePipelineConstructionType,
2476                                      m_context.getBinaryCollection().get("comp0"));
2477     pipeline0.setDescriptorSetLayout(descriptorSetLayout.get());
2478     pipeline0.buildPipeline();
2479     ComputePipelineWrapper pipeline1(vk, device, m_computePipelineConstructionType,
2480                                      m_context.getBinaryCollection().get("comp1"));
2481     pipeline1.setDescriptorSetLayout(descriptorSetLayout.get());
2482     pipeline1.buildPipeline();
2483 
2484     const VkBufferMemoryBarrier writeUniformConstantsBarrier = makeBufferMemoryBarrier(
2485         VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_UNIFORM_READ_BIT, *uniformBuffer, 0ull, uniformBufferSizeBytes);
2486 
2487     const VkImageMemoryBarrier imageLayoutBarrier =
2488         makeImageMemoryBarrier(0u, 0u, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL, *image, subresourceRange);
2489 
2490     const VkImageMemoryBarrier imageBarrierBetweenShaders =
2491         makeImageMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, VK_IMAGE_LAYOUT_GENERAL,
2492                                VK_IMAGE_LAYOUT_GENERAL, *image, subresourceRange);
2493 
2494     const VkBufferMemoryBarrier afterComputeBarrier = makeBufferMemoryBarrier(
2495         VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *outputBuffer, 0ull, outputBufferSizeBytes);
2496 
2497     const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
2498     const Unique<VkCommandBuffer> cmdBuffer(
2499         allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
2500 
2501     // Start recording commands
2502 
2503     beginCommandBuffer(vk, *cmdBuffer);
2504 
2505     pipeline0.bind(*cmdBuffer);
2506     vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline0.getPipelineLayout(), 0u, 1u,
2507                              &descriptorSet.get(), 0u, nullptr);
2508 
2509     vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
2510                           (VkDependencyFlags)0, 0, nullptr, 1, &writeUniformConstantsBarrier, 1, &imageLayoutBarrier);
2511 
2512     vk.cmdDispatch(*cmdBuffer, m_imageSize.x(), m_imageSize.y(), 1u);
2513     vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
2514                           (VkDependencyFlags)0, 0, nullptr, 0, nullptr, 1, &imageBarrierBetweenShaders);
2515 
2516     // Switch to the second shader program
2517     pipeline1.bind(*cmdBuffer);
2518 
2519     vk.cmdDispatch(*cmdBuffer, m_imageSize.x(), m_imageSize.y(), 1u);
2520     vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT,
2521                           (VkDependencyFlags)0, 0, nullptr, 1, &afterComputeBarrier, 0, nullptr);
2522 
2523     endCommandBuffer(vk, *cmdBuffer);
2524 
2525     // Wait for completion
2526 
2527     submitCommandsAndWait(vk, device, queue, *cmdBuffer);
2528 
2529     // Validate the results
2530 
2531     const Allocation &outputBufferAllocation = outputBuffer.getAllocation();
2532     invalidateAlloc(vk, device, outputBufferAllocation);
2533 
2534     const int numValues       = multiplyComponents(m_imageSize);
2535     const uint32_t *bufferPtr = static_cast<uint32_t *>(outputBufferAllocation.getHostPtr());
2536     const uint32_t res        = *bufferPtr;
2537     uint32_t ref              = 0;
2538 
2539     for (int ndx = 0; ndx < numValues; ++ndx)
2540         ref += baseValue + ndx;
2541 
2542     if (res != ref)
2543     {
2544         std::ostringstream msg;
2545         msg << "ERROR: comparison failed, expected " << ref << ", got " << res;
2546         return tcu::TestStatus::fail(msg.str());
2547     }
2548     return tcu::TestStatus::pass("Compute succeeded");
2549 }
2550 
// Base class for compute test instances that run on a Vulkan device group.
// The constructor builds a custom instance (with VK_KHR_device_group_creation)
// and a logical device spanning every physical device in the group selected on
// the command line; see createDeviceGroup().
class ComputeTestInstance : public vkt::TestInstance
{
public:
    ComputeTestInstance(Context &context, vk::ComputePipelineConstructionType computePipelineConstructionType,
                        bool useMaintenance5)
        : TestInstance(context)
        , m_numPhysDevices(1)   // overwritten with the real group size in createDeviceGroup()
        , m_queueFamilyIndex(0) // overwritten with a compute-capable family in createDeviceGroup()
        , m_computePipelineConstructionType(computePipelineConstructionType)
        , m_maintenance5(useMaintenance5) // when set, VK_KHR_maintenance5 is enabled on the device
    {
        createDeviceGroup();
    }

    ~ComputeTestInstance()
    {
    }

    // Creates m_deviceGroupInstance, m_logicalDevice, m_physicalDevices and
    // m_deviceDriver, and selects m_queueFamilyIndex.
    void createDeviceGroup(void);
    const vk::DeviceInterface &getDeviceInterface(void)
    {
        return *m_deviceDriver;
    }
    vk::VkInstance getInstance(void)
    {
        return m_deviceGroupInstance;
    }
    vk::VkDevice getDevice(void)
    {
        return *m_logicalDevice;
    }
    // i indexes into the physical devices of the selected device group.
    vk::VkPhysicalDevice getPhysicalDevice(uint32_t i = 0)
    {
        return m_physicalDevices[i];
    }

protected:
    uint32_t m_numPhysDevices;
    uint32_t m_queueFamilyIndex;
    vk::ComputePipelineConstructionType m_computePipelineConstructionType;
    bool m_maintenance5;

private:
    CustomInstance m_deviceGroupInstance;
    vk::Move<vk::VkDevice> m_logicalDevice;
    std::vector<vk::VkPhysicalDevice> m_physicalDevices;
#ifndef CTS_USES_VULKANSC
    de::MovePtr<vk::DeviceDriver> m_deviceDriver;
#else
    de::MovePtr<vk::DeviceDriverSC, vk::DeinitDeviceDeleter> m_deviceDriver;
#endif // CTS_USES_VULKANSC
};
2603 
// Creates the custom instance, selects the device group and physical device
// named on the command line, and builds a logical device covering the whole
// group. Also picks a compute-capable queue family and instantiates the
// matching device driver wrapper (regular Vulkan or Vulkan SC).
void ComputeTestInstance::createDeviceGroup(void)
{
    const tcu::CommandLine &cmdLine = m_context.getTestContext().getCommandLine();
    const uint32_t devGroupIdx      = cmdLine.getVKDeviceGroupId() - 1; // command-line IDs are 1-based
    const uint32_t physDeviceIdx    = cmdLine.getVKDeviceId() - 1;      // index within the selected group
    const float queuePriority       = 1.0f;
    const std::vector<std::string> requiredExtensions(1, "VK_KHR_device_group_creation");
    m_deviceGroupInstance = createCustomInstanceWithExtensions(m_context, requiredExtensions);
    std::vector<VkPhysicalDeviceGroupProperties> devGroupProperties =
        enumeratePhysicalDeviceGroups(m_context.getInstanceInterface(), m_deviceGroupInstance);
    m_numPhysDevices = devGroupProperties[devGroupIdx].physicalDeviceCount;
    std::vector<const char *> deviceExtensions;

    // VK_KHR_device_group only needs to be enabled explicitly when it is not
    // already part of the core API version in use.
    if (!isCoreDeviceExtension(m_context.getUsedApiVersion(), "VK_KHR_device_group"))
        deviceExtensions.push_back("VK_KHR_device_group");

    if (m_maintenance5)
        deviceExtensions.push_back("VK_KHR_maintenance5");

    VkDeviceGroupDeviceCreateInfo deviceGroupInfo = {
        VK_STRUCTURE_TYPE_DEVICE_GROUP_DEVICE_CREATE_INFO,   // sType
        nullptr,                                             // pNext (patched below when features2 is chained)
        devGroupProperties[devGroupIdx].physicalDeviceCount, // physicalDeviceCount
        devGroupProperties[devGroupIdx].physicalDevices      // physicalDevices
    };
    const InstanceDriver &instance(m_deviceGroupInstance.getDriver());
    VkPhysicalDeviceFeatures2 deviceFeatures2 = initVulkanStructure();
    const VkPhysicalDeviceFeatures deviceFeatures =
        getPhysicalDeviceFeatures(instance, deviceGroupInfo.pPhysicalDevices[physDeviceIdx]);
    const std::vector<VkQueueFamilyProperties> queueProps = getPhysicalDeviceQueueFamilyProperties(
        instance, devGroupProperties[devGroupIdx].physicalDevices[physDeviceIdx]);

    deviceFeatures2.features = deviceFeatures;

#ifndef CTS_USES_VULKANSC
    // Shader-object based pipeline construction additionally needs
    // VK_EXT_shader_object, which in turn requires dynamic rendering.
    VkPhysicalDeviceDynamicRenderingFeaturesKHR dynamicRenderingFeatures = initVulkanStructure();
    dynamicRenderingFeatures.dynamicRendering                            = VK_TRUE;
    VkPhysicalDeviceShaderObjectFeaturesEXT shaderObjectFeatures = initVulkanStructure(&dynamicRenderingFeatures);
    shaderObjectFeatures.shaderObject                            = VK_TRUE;
    if (m_computePipelineConstructionType)
    {
        deviceExtensions.push_back("VK_EXT_shader_object");
        deviceFeatures2.pNext = &shaderObjectFeatures;
    }
#endif

    m_physicalDevices.resize(m_numPhysDevices);
    for (uint32_t physDevIdx = 0; physDevIdx < m_numPhysDevices; physDevIdx++)
        m_physicalDevices[physDevIdx] = devGroupProperties[devGroupIdx].physicalDevices[physDevIdx];

    // Keeps overwriting the index, so the last compute-capable family wins.
    for (size_t queueNdx = 0; queueNdx < queueProps.size(); queueNdx++)
    {
        if (queueProps[queueNdx].queueFlags & VK_QUEUE_COMPUTE_BIT)
            m_queueFamilyIndex = (uint32_t)queueNdx;
    }

    VkDeviceQueueCreateInfo queueInfo = {
        VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO, // VkStructureType sType;
        nullptr,                                    // const void* pNext;
        (VkDeviceQueueCreateFlags)0u,               // VkDeviceQueueCreateFlags flags;
        m_queueFamilyIndex,                         // uint32_t queueFamilyIndex;
        1u,                                         // uint32_t queueCount;
        &queuePriority                              // const float* pQueuePriorities;
    };

    // VkDeviceCreateInfo::pNext chain starts with deviceGroupInfo; when
    // extended features are in use, deviceFeatures2 (and whatever hangs off
    // its pNext) is appended behind the group info.
    void *pNext = &deviceGroupInfo;
    if (deviceFeatures2.pNext != nullptr)
        deviceGroupInfo.pNext = &deviceFeatures2;

#ifdef CTS_USES_VULKANSC
    // Vulkan SC: prepend object-reservation and SC 1.0 feature structures to
    // the pNext chain; in a subprocess, also hand over the pipeline cache data
    // and pool sizes recorded by the main process.
    VkDeviceObjectReservationCreateInfo memReservationInfo = cmdLine.isSubProcess() ?
                                                                 m_context.getResourceInterface()->getStatMax() :
                                                                 resetDeviceObjectReservationCreateInfo();
    memReservationInfo.pNext                               = pNext;
    pNext                                                  = &memReservationInfo;

    VkPhysicalDeviceVulkanSC10Features sc10Features = createDefaultSC10Features();
    sc10Features.pNext                              = pNext;
    pNext                                           = &sc10Features;
    VkPipelineCacheCreateInfo pcCI;
    std::vector<VkPipelinePoolSize> poolSizes;
    if (cmdLine.isSubProcess())
    {
        if (m_context.getResourceInterface()->getCacheDataSize() > 0)
        {
            pcCI = {
                VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO, // VkStructureType sType;
                nullptr,                                      // const void* pNext;
                VK_PIPELINE_CACHE_CREATE_READ_ONLY_BIT |
                    VK_PIPELINE_CACHE_CREATE_USE_APPLICATION_STORAGE_BIT, // VkPipelineCacheCreateFlags flags;
                m_context.getResourceInterface()->getCacheDataSize(),     // uintptr_t initialDataSize;
                m_context.getResourceInterface()->getCacheData()          // const void* pInitialData;
            };
            memReservationInfo.pipelineCacheCreateInfoCount = 1;
            memReservationInfo.pPipelineCacheCreateInfos    = &pcCI;
        }

        poolSizes = m_context.getResourceInterface()->getPipelinePoolSizes();
        if (!poolSizes.empty())
        {
            memReservationInfo.pipelinePoolSizeCount = uint32_t(poolSizes.size());
            memReservationInfo.pPipelinePoolSizes    = poolSizes.data();
        }
    }

#endif // CTS_USES_VULKANSC

    const VkDeviceCreateInfo deviceInfo = {
        VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO, // VkStructureType sType;
        pNext,                                // const void* pNext;
        (VkDeviceCreateFlags)0,               // VkDeviceCreateFlags flags;
        1u,                                   // uint32_t queueCreateInfoCount;
        &queueInfo,                           // const VkDeviceQueueCreateInfo* pQueueCreateInfos;
        0u,                                   // uint32_t enabledLayerCount;
        nullptr,                              // const char* const* ppEnabledLayerNames;
        uint32_t(deviceExtensions.size()),    // uint32_t enabledExtensionCount;
        (deviceExtensions.empty() ? nullptr : &deviceExtensions[0]), // const char* const* ppEnabledExtensionNames;
        // Features go through deviceFeatures2 when it is part of the pNext
        // chain above; the legacy pEnabledFeatures pointer is used otherwise.
        deviceFeatures2.pNext == nullptr ? &deviceFeatures :
                                           nullptr, // const VkPhysicalDeviceFeatures* pEnabledFeatures;
    };

    m_logicalDevice = createCustomDevice(m_context.getTestContext().getCommandLine().isValidationEnabled(),
                                         m_context.getPlatformInterface(), m_deviceGroupInstance, instance,
                                         deviceGroupInfo.pPhysicalDevices[physDeviceIdx], &deviceInfo);
#ifndef CTS_USES_VULKANSC
    m_deviceDriver = de::MovePtr<DeviceDriver>(new DeviceDriver(m_context.getPlatformInterface(), m_deviceGroupInstance,
                                                                *m_logicalDevice, m_context.getUsedApiVersion(),
                                                                m_context.getTestContext().getCommandLine()));
#else
    m_deviceDriver = de::MovePtr<DeviceDriverSC, DeinitDeviceDeleter>(
        new DeviceDriverSC(m_context.getPlatformInterface(), m_context.getInstance(), *m_logicalDevice,
                           m_context.getTestContext().getCommandLine(), m_context.getResourceInterface(),
                           m_context.getDeviceVulkanSC10Properties(), m_context.getDeviceProperties(),
                           m_context.getUsedApiVersion()),
        vk::DeinitDeviceDeleter(m_context.getResourceInterface().get(), *m_logicalDevice));
#endif // CTS_USES_VULKANSC
}
2743 
// Test case exercising vkCmdDispatchBase: one logical compute workload is
// split into several dispatches with non-zero base workgroup offsets, spread
// across the physical devices of a device group.
class DispatchBaseTest : public vkt::TestCase
{
public:
    DispatchBaseTest(tcu::TestContext &testCtx, const std::string &name, const uint32_t numValues,
                     const tcu::IVec3 &localsize, const tcu::IVec3 &worksize, const tcu::IVec3 &splitsize,
                     const vk::ComputePipelineConstructionType computePipelineConstructionType,
                     const bool useMaintenance5);

    // Rejects the test early when shader objects or VK_KHR_maintenance5 are
    // required but unsupported.
    virtual void checkSupport(Context &context) const;
    // Registers the "comp" GLSL compute shader.
    void initPrograms(SourceCollections &sourceCollections) const;
    TestInstance *createInstance(Context &context) const;

private:
    const uint32_t m_numValues;  // element count of the in/out storage buffer
    const tcu::IVec3 m_localSize; // workgroup (local) size
    const tcu::IVec3 m_workSize;  // total workgroup grid
    const tcu::IVec3 m_splitSize; // per-physical-device slice of the grid
    vk::ComputePipelineConstructionType m_computePipelineConstructionType;
    const bool m_useMaintenance5;
};
2764 
// Instance side of DispatchBaseTest: records the split dispatches and checks
// that every buffer element was processed exactly once.
class DispatchBaseTestInstance : public ComputeTestInstance
{
public:
    DispatchBaseTestInstance(Context &context, const uint32_t numValues, const tcu::IVec3 &localsize,
                             const tcu::IVec3 &worksize, const tcu::IVec3 &splitsize,
                             const vk::ComputePipelineConstructionType computePipelineConstructionType,
                             const bool useMaintenance5);

    // True when every component of 'big' is a multiple of (and at least as
    // large as) the corresponding component of 'small'.
    bool isInputVectorValid(const tcu::IVec3 &small, const tcu::IVec3 &big);
    tcu::TestStatus iterate(void);

private:
    const uint32_t m_numValues;      // element count of the in/out storage buffer
    const tcu::IVec3 m_localSize;    // workgroup (local) size
    const tcu::IVec3 m_workSize;     // total workgroup grid
    const tcu::IVec3 m_splitWorkSize; // per-physical-device slice of the grid
    const bool m_useMaintenance5;
};
2783 
// Captures the workload configuration; all Vulkan work happens in the
// corresponding DispatchBaseTestInstance.
DispatchBaseTest::DispatchBaseTest(tcu::TestContext &testCtx, const std::string &name, const uint32_t numValues,
                                   const tcu::IVec3 &localsize, const tcu::IVec3 &worksize, const tcu::IVec3 &splitsize,
                                   const vk::ComputePipelineConstructionType computePipelineConstructionType,
                                   const bool useMaintenance5)
    : TestCase(testCtx, name)
    , m_numValues(numValues)
    , m_localSize(localsize)
    , m_workSize(worksize)
    , m_splitSize(splitsize)
    , m_computePipelineConstructionType(computePipelineConstructionType)
    , m_useMaintenance5(useMaintenance5)
{
}
2797 
// Verifies the pipeline-construction mode is usable on this device, and that
// VK_KHR_maintenance5 is present when the test variant relies on it.
void DispatchBaseTest::checkSupport(Context &context) const
{
    checkShaderObjectRequirements(context.getInstanceInterface(), context.getPhysicalDevice(),
                                  m_computePipelineConstructionType);
    if (m_useMaintenance5)
        context.requireDeviceFunctionality("VK_KHR_maintenance5");
}
2805 
initPrograms(SourceCollections & sourceCollections) const2806 void DispatchBaseTest::initPrograms(SourceCollections &sourceCollections) const
2807 {
2808     std::ostringstream src;
2809     src << "#version 310 es\n"
2810         << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y()
2811         << ", local_size_z = " << m_localSize.z() << ") in;\n"
2812 
2813         << "layout(binding = 0) buffer InOut {\n"
2814         << "    uint values[" << de::toString(m_numValues) << "];\n"
2815         << "} sb_inout;\n"
2816 
2817         << "layout(binding = 1) readonly uniform uniformInput {\n"
2818         << "    uvec3 gridSize;\n"
2819         << "} ubo_in;\n"
2820 
2821         << "void main (void) {\n"
2822         << "    uvec3 size = ubo_in.gridSize * gl_WorkGroupSize;\n"
2823         << "    uint numValuesPerInv = uint(sb_inout.values.length()) / (size.x*size.y*size.z);\n"
2824         << "    uint index = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + "
2825            "gl_GlobalInvocationID.x;\n"
2826         << "    uint offset = numValuesPerInv*index;\n"
2827         << "    for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
2828         << "        sb_inout.values[offset + ndx] = ~sb_inout.values[offset + ndx];\n"
2829         << "}\n";
2830 
2831     sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
2832 }
2833 
// Factory hook: creates the instance that records and validates the dispatches.
TestInstance *DispatchBaseTest::createInstance(Context &context) const
{
    return new DispatchBaseTestInstance(context, m_numValues, m_localSize, m_workSize, m_splitSize,
                                        m_computePipelineConstructionType, m_useMaintenance5);
}
2839 
// Validates the workload configuration up front; an inconsistent combination
// is a broken test definition, hence TestError rather than a test failure.
DispatchBaseTestInstance::DispatchBaseTestInstance(
    Context &context, const uint32_t numValues, const tcu::IVec3 &localsize, const tcu::IVec3 &worksize,
    const tcu::IVec3 &splitsize, const vk::ComputePipelineConstructionType computePipelineConstructionType,
    const bool useMaintenance5)

    : ComputeTestInstance(context, computePipelineConstructionType, useMaintenance5)
    , m_numValues(numValues)
    , m_localSize(localsize)
    , m_workSize(worksize)
    , m_splitWorkSize(splitsize)
    , m_useMaintenance5(useMaintenance5)
{
    // For easy work distribution across physical devices:
    // WorkSize should be a multiple of SplitWorkSize only in the X component
    // (Y and Z must match exactly, and X must leave room for more than one slice).
    if ((!isInputVectorValid(m_splitWorkSize, m_workSize)) || (m_workSize.x() <= m_splitWorkSize.x()) ||
        (m_workSize.y() != m_splitWorkSize.y()) || (m_workSize.z() != m_splitWorkSize.z()))
        TCU_THROW(TestError, "Invalid Input.");

    // For easy work distribution within the same physical device:
    // SplitWorkSize should be a multiple of localSize in Y or Z component
    // (X must match exactly — iterate() relies on this when it dispatches
    // m_localSize.x() groups per non-final device).
    if ((!isInputVectorValid(m_localSize, m_splitWorkSize)) || (m_localSize.x() != m_splitWorkSize.x()) ||
        (m_localSize.y() >= m_splitWorkSize.y()) || (m_localSize.z() >= m_splitWorkSize.z()))
        TCU_THROW(TestError, "Invalid Input.");

    // Every physical device must receive at least one split-sized slice.
    if ((multiplyComponents(m_workSize) / multiplyComponents(m_splitWorkSize)) < (int32_t)m_numPhysDevices)
        TCU_THROW(TestError, "Not enough work to distribute across all physical devices.");

    // The buffer is processed in equal per-invocation slices, so the total
    // invocation count must divide the element count exactly.
    uint32_t totalWork = multiplyComponents(m_workSize) * multiplyComponents(m_localSize);
    if ((totalWork > numValues) || (numValues % totalWork != 0))
        TCU_THROW(TestError, "Buffer too small/not aligned to cover all values.");
}
2871 
isInputVectorValid(const tcu::IVec3 & small,const tcu::IVec3 & big)2872 bool DispatchBaseTestInstance::isInputVectorValid(const tcu::IVec3 &small, const tcu::IVec3 &big)
2873 {
2874     if (((big.x() < small.x()) || (big.y() < small.y()) || (big.z() < small.z())) ||
2875         ((big.x() % small.x() != 0) || (big.y() % small.y() != 0) || (big.z() % small.z() != 0)))
2876         return false;
2877     return true;
2878 }
2879 
// Runs the split workload: uploads random data, dispatches the grid in slices
// via vkCmdDispatchBase (one slice per physical device, further subdivided in
// Y/Z), then checks that every element was bit-inverted exactly once.
tcu::TestStatus DispatchBaseTestInstance::iterate(void)
{
    const DeviceInterface &vk = getDeviceInterface();
    const VkDevice device     = getDevice();
    const VkQueue queue       = getDeviceQueue(vk, device, m_queueFamilyIndex, 0);
    SimpleAllocator allocator(vk, device,
                              getPhysicalDeviceMemoryProperties(m_context.getInstanceInterface(), getPhysicalDevice()));
    // Number of workgroups dispatched in total, cross-checked against m_workSize below.
    uint32_t totalWorkloadSize = 0;

    // Create an uniform and input/output buffer
    const uint32_t uniformBufSize             = 3; // Pass the compute grid size
    const VkDeviceSize uniformBufferSizeBytes = sizeof(uint32_t) * uniformBufSize;
    const BufferWithMemory uniformBuffer(
        vk, device, allocator, makeBufferCreateInfo(uniformBufferSizeBytes, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT),
        MemoryRequirement::HostVisible);

    const VkDeviceSize bufferSizeBytes = sizeof(uint32_t) * m_numValues;
    const BufferWithMemory buffer(vk, device, allocator,
                                  makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT),
                                  MemoryRequirement::HostVisible);

    // Fill the buffers with data
    typedef std::vector<uint32_t> data_vector_t;
    data_vector_t uniformInputData(uniformBufSize);
    data_vector_t inputData(m_numValues); // host-side copy kept for validation

    // UBO: the full grid size, so the shader can compute global indices
    // independent of the per-dispatch gl_NumWorkGroups.
    {
        const Allocation &bufferAllocation = uniformBuffer.getAllocation();
        uint32_t *bufferPtr                = static_cast<uint32_t *>(bufferAllocation.getHostPtr());
        uniformInputData[0] = *bufferPtr++ = m_workSize.x();
        uniformInputData[1] = *bufferPtr++ = m_workSize.y();
        uniformInputData[2] = *bufferPtr++ = m_workSize.z();
        flushAlloc(vk, device, bufferAllocation);
    }

    // Storage buffer: deterministic pseudo-random input values.
    {
        de::Random rnd(0x82ce7f);
        const Allocation &bufferAllocation = buffer.getAllocation();
        uint32_t *bufferPtr                = static_cast<uint32_t *>(bufferAllocation.getHostPtr());
        for (uint32_t i = 0; i < m_numValues; ++i)
            inputData[i] = *bufferPtr++ = rnd.getUint32();

        flushAlloc(vk, device, bufferAllocation);
    }

    // Create descriptor set
    const Unique<VkDescriptorSetLayout> descriptorSetLayout(
        DescriptorSetLayoutBuilder()
            .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
            .addSingleBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
            .build(vk, device));

    const Unique<VkDescriptorPool> descriptorPool(
        DescriptorPoolBuilder()
            .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
            .addType(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)
            .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

    const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

    const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*buffer, 0ull, bufferSizeBytes);
    const VkDescriptorBufferInfo uniformBufferDescriptorInfo =
        makeDescriptorBufferInfo(*uniformBuffer, 0ull, uniformBufferSizeBytes);

    DescriptorSetUpdateBuilder()
        .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u),
                     VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
        .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u),
                     VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, &uniformBufferDescriptorInfo)
        .update(vk, device);

    // vkCmdDispatchBase requires the pipeline to be created with the
    // DISPATCH_BASE flag (legacy flag, or the flags2 equivalent with maintenance5).
    ComputePipelineWrapper pipeline(vk, device, m_computePipelineConstructionType,
                                    m_context.getBinaryCollection().get("comp"));
    pipeline.setDescriptorSetLayout(descriptorSetLayout.get());
    pipeline.setPipelineCreateFlags(VK_PIPELINE_CREATE_DISPATCH_BASE);

#ifndef CTS_USES_VULKANSC
    VkPipelineCreateFlags2CreateInfoKHR pipelineFlags2CreateInfo = initVulkanStructure();
    if (m_useMaintenance5)
    {
        // maintenance5 path: move the flag into VkPipelineCreateFlags2CreateInfoKHR.
        pipelineFlags2CreateInfo.flags = VK_PIPELINE_CREATE_2_DISPATCH_BASE_BIT_KHR;
        pipeline.setPipelineCreatePNext(&pipelineFlags2CreateInfo);
        pipeline.setPipelineCreateFlags(0);
    }
#else
    DE_UNREF(m_useMaintenance5);
#endif // CTS_USES_VULKANSC

    pipeline.buildPipeline();

    // Host writes must be visible to the shader before the dispatches, and
    // shader writes must be visible to the host before validation.
    const VkBufferMemoryBarrier hostWriteBarrier =
        makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *buffer, 0ull, bufferSizeBytes);
    const VkBufferMemoryBarrier hostUniformWriteBarrier = makeBufferMemoryBarrier(
        VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_UNIFORM_READ_BIT, *uniformBuffer, 0ull, uniformBufferSizeBytes);

    const VkBufferMemoryBarrier shaderWriteBarrier =
        makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer, 0ull, bufferSizeBytes);

    const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, m_queueFamilyIndex));
    const Unique<VkCommandBuffer> cmdBuffer(
        allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

    // Start recording commands
    beginCommandBuffer(vk, *cmdBuffer);

    pipeline.bind(*cmdBuffer);
    vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline.getPipelineLayout(), 0u, 1u,
                             &descriptorSet.get(), 0u, nullptr);

    vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                          (VkDependencyFlags)0, 0, nullptr, 1, &hostUniformWriteBarrier, 0, nullptr);

    vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                          (VkDependencyFlags)0, 0, nullptr, 1, &hostWriteBarrier, 0, nullptr);

    // Split the workload across all physical devices based on m_splitWorkSize.x()
    for (uint32_t physDevIdx = 0; physDevIdx < m_numPhysDevices; physDevIdx++)
    {
        uint32_t baseGroupX = physDevIdx * m_splitWorkSize.x();
        uint32_t baseGroupY = 0;
        uint32_t baseGroupZ = 0;

        // Split the workload within the physical device based on m_localSize.y() and m_localSize.z()
        for (int32_t localIdxY = 0; localIdxY < (m_splitWorkSize.y() / m_localSize.y()); localIdxY++)
        {
            for (int32_t localIdxZ = 0; localIdxZ < (m_splitWorkSize.z() / m_localSize.z()); localIdxZ++)
            {
                uint32_t offsetX = baseGroupX;
                uint32_t offsetY = baseGroupY + localIdxY * m_localSize.y();
                uint32_t offsetZ = baseGroupZ + localIdxZ * m_localSize.z();

                // Group counts for this dispatch: the last device takes the
                // remainder of the X range; the constructor guarantees
                // m_localSize.x() == m_splitWorkSize.x() for the others.
                uint32_t localSizeX =
                    (physDevIdx == (m_numPhysDevices - 1)) ? m_workSize.x() - baseGroupX : m_localSize.x();
                uint32_t localSizeY = m_localSize.y();
                uint32_t localSizeZ = m_localSize.z();

                totalWorkloadSize += (localSizeX * localSizeY * localSizeZ);
                vk.cmdDispatchBase(*cmdBuffer, offsetX, offsetY, offsetZ, localSizeX, localSizeY, localSizeZ);
            }
        }
    }

    vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT,
                          (VkDependencyFlags)0, 0, nullptr, 1, &shaderWriteBarrier, 0, nullptr);

    endCommandBuffer(vk, *cmdBuffer);
    submitCommandsAndWait(vk, device, queue, *cmdBuffer);

    // Sanity check: the dispatched slices must tile the whole grid.
    if (totalWorkloadSize != uint32_t(multiplyComponents(m_workSize)))
        TCU_THROW(TestError, "Not covering the entire workload.");

    // Validate the results
    const Allocation &bufferAllocation = buffer.getAllocation();
    invalidateAlloc(vk, device, bufferAllocation);
    const uint32_t *bufferPtr = static_cast<uint32_t *>(bufferAllocation.getHostPtr());

    for (uint32_t ndx = 0; ndx < m_numValues; ++ndx)
    {
        const uint32_t res = bufferPtr[ndx];
        const uint32_t ref = ~inputData[ndx]; // shader inverts each value exactly once

        if (res != ref)
        {
            std::ostringstream msg;
            msg << "Comparison failed for InOut.values[" << ndx << "]";
            return tcu::TestStatus::fail(msg.str());
        }
    }
    return tcu::TestStatus::pass("Compute succeeded");
}
3050 
// Test case for gl_DeviceIndex: each physical device in the group writes a
// device-specific value (fed through a UBO) into the shared buffer.
class DeviceIndexTest : public vkt::TestCase
{
public:
    DeviceIndexTest(tcu::TestContext &testCtx, const std::string &name, const uint32_t numValues,
                    const tcu::IVec3 &localsize, const tcu::IVec3 &splitsize,
                    const vk::ComputePipelineConstructionType computePipelineConstructionType);

    virtual void checkSupport(Context &context) const;
    // Registers the "comp" shader, which uses GL_EXT_device_group / gl_DeviceIndex.
    void initPrograms(SourceCollections &sourceCollections) const;
    TestInstance *createInstance(Context &context) const;

private:
    const uint32_t m_numValues;
    const tcu::IVec3 m_localSize;
    const tcu::IVec3 m_workSize;
    // NOTE(review): m_splitSize is not initialized by the constructor
    // definition visible in this file and appears unused here — confirm
    // against the rest of the file before relying on it.
    const tcu::IVec3 m_splitSize;
    vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
3069 
// Instance side of DeviceIndexTest.
class DeviceIndexTestInstance : public ComputeTestInstance
{
public:
    DeviceIndexTestInstance(Context &context, const uint32_t numValues, const tcu::IVec3 &localsize,
                            const tcu::IVec3 &worksize,
                            const vk::ComputePipelineConstructionType computePipelineConstructionType);
    tcu::TestStatus iterate(void);

private:
    const uint32_t m_numValues;
    const tcu::IVec3 m_localSize;
    // Non-const, unlike the other members — presumably adjusted inside
    // iterate() (definition not visible here); confirm before changing.
    tcu::IVec3 m_workSize;
};
3083 
// Captures the shader/workload configuration.
// NOTE(review): the in-class declaration names the fifth parameter
// 'splitsize' while this definition calls it 'worksize' (it initializes
// m_workSize); the names should be reconciled for clarity.
DeviceIndexTest::DeviceIndexTest(tcu::TestContext &testCtx, const std::string &name, const uint32_t numValues,
                                 const tcu::IVec3 &localsize, const tcu::IVec3 &worksize,
                                 const vk::ComputePipelineConstructionType computePipelineConstructionType)
    : TestCase(testCtx, name)
    , m_numValues(numValues)
    , m_localSize(localsize)
    , m_workSize(worksize)
    , m_computePipelineConstructionType(computePipelineConstructionType)
{
}
3094 
checkSupport(Context & context) const3095 void DeviceIndexTest::checkSupport(Context &context) const
3096 {
3097     checkShaderObjectRequirements(context.getInstanceInterface(), context.getPhysicalDevice(),
3098                                   m_computePipelineConstructionType);
3099 }
3100 
initPrograms(SourceCollections & sourceCollections) const3101 void DeviceIndexTest::initPrograms(SourceCollections &sourceCollections) const
3102 {
3103     std::ostringstream src;
3104     src << "#version 310 es\n"
3105         << "#extension GL_EXT_device_group : require\n"
3106         << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y()
3107         << ", local_size_z = " << m_localSize.z() << ") in;\n"
3108 
3109         << "layout(binding = 0) buffer InOut {\n"
3110         << "    uint values[" << de::toString(m_numValues) << "];\n"
3111         << "} sb_inout;\n"
3112 
3113         << "layout(binding = 1) readonly uniform uniformInput {\n"
3114         << "    uint baseOffset[1+" << VK_MAX_DEVICE_GROUP_SIZE << "];\n"
3115         << "} ubo_in;\n"
3116 
3117         << "void main (void) {\n"
3118         << "    uvec3 size = gl_NumWorkGroups * gl_WorkGroupSize;\n"
3119         << "    uint numValuesPerInv = uint(sb_inout.values.length()) / (size.x*size.y*size.z);\n"
3120         << "    uint index = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + "
3121            "gl_GlobalInvocationID.x;\n"
3122         << "    uint offset = numValuesPerInv*index;\n"
3123         << "    for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
3124         << "        sb_inout.values[offset + ndx] = ubo_in.baseOffset[0] + ubo_in.baseOffset[gl_DeviceIndex + 1];\n"
3125         << "}\n";
3126 
3127     sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
3128 }
3129 
createInstance(Context & context) const3130 TestInstance *DeviceIndexTest::createInstance(Context &context) const
3131 {
3132     return new DeviceIndexTestInstance(context, m_numValues, m_localSize, m_workSize,
3133                                        m_computePipelineConstructionType);
3134 }
3135 
DeviceIndexTestInstance(Context & context,const uint32_t numValues,const tcu::IVec3 & localsize,const tcu::IVec3 & worksize,const vk::ComputePipelineConstructionType computePipelineConstructionType)3136 DeviceIndexTestInstance::DeviceIndexTestInstance(
3137     Context &context, const uint32_t numValues, const tcu::IVec3 &localsize, const tcu::IVec3 &worksize,
3138     const vk::ComputePipelineConstructionType computePipelineConstructionType)
3139 
3140     : ComputeTestInstance(context, computePipelineConstructionType, false)
3141     , m_numValues(numValues)
3142     , m_localSize(localsize)
3143     , m_workSize(worksize)
3144 {
3145 }
3146 
// Dispatches the compute shader once per non-empty device mask and verifies,
// for every physical device selected by the mask, that each SSBO element
// equals constantValPerLoop + baseOffset[deviceIndex + 1].
tcu::TestStatus DeviceIndexTestInstance::iterate(void)
{
    const DeviceInterface &vk = getDeviceInterface();
    const VkDevice device     = getDevice();
    const VkQueue queue       = getDeviceQueue(vk, device, m_queueFamilyIndex, 0);
    SimpleAllocator allocator(vk, device,
                              getPhysicalDeviceMemoryProperties(m_context.getInstanceInterface(), getPhysicalDevice()));
    // One bit set per physical device in the group; used when allocating the SSBO memory.
    const uint32_t allocDeviceMask = (1 << m_numPhysDevices) - 1;
    de::Random rnd(0x82ce7f);
    Move<VkBuffer> sboBuffer;
    vk::Move<vk::VkDeviceMemory> sboBufferMemory;

    // Create an uniform and output buffer.
    // 4 uints per logical UBO element: the host mirrors a 16-byte element
    // stride, matching the validation below which reads index 4*(devIdx+1)
    // (consistent with std140 uint-array stride -- see the shader's UBO).
    const uint32_t uniformBufSize             = 4 * (1 + VK_MAX_DEVICE_GROUP_SIZE);
    const VkDeviceSize uniformBufferSizeBytes = sizeof(uint32_t) * uniformBufSize;
    const BufferWithMemory uniformBuffer(
        vk, device, allocator, makeBufferCreateInfo(uniformBufferSizeBytes, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT),
        MemoryRequirement::HostVisible);

    // Host-visible staging buffer used to read the SSBO back for validation.
    const VkDeviceSize bufferSizeBytes = sizeof(uint32_t) * m_numValues;
    const BufferWithMemory checkBuffer(vk, device, allocator,
                                       makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_TRANSFER_DST_BIT),
                                       MemoryRequirement::HostVisible);

    // create SBO buffer -- allocated manually (not via the allocator) so the
    // allocation can carry VK_MEMORY_ALLOCATE_DEVICE_MASK_BIT for the group.
    {
        const VkBufferCreateInfo sboBufferParams = {
            VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,                                  // sType
            nullptr,                                                               // pNext
            0u,                                                                    // flags
            (VkDeviceSize)bufferSizeBytes,                                         // size
            VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT, // usage
            VK_SHARING_MODE_EXCLUSIVE,                                             // sharingMode
            1u,                                                                    // queueFamilyIndexCount
            &m_queueFamilyIndex,                                                   // pQueueFamilyIndices
        };
        sboBuffer = createBuffer(vk, device, &sboBufferParams);

        // Pick the first memory type that is both compatible with the buffer
        // and DEVICE_LOCAL (the SSBO is never mapped by the host).
        VkMemoryRequirements memReqs = getBufferMemoryRequirements(vk, device, sboBuffer.get());
        uint32_t memoryTypeNdx       = 0;
        const VkPhysicalDeviceMemoryProperties deviceMemProps =
            getPhysicalDeviceMemoryProperties(m_context.getInstanceInterface(), getPhysicalDevice());
        for (memoryTypeNdx = 0; memoryTypeNdx < deviceMemProps.memoryTypeCount; memoryTypeNdx++)
        {
            if ((memReqs.memoryTypeBits & (1u << memoryTypeNdx)) != 0 &&
                (deviceMemProps.memoryTypes[memoryTypeNdx].propertyFlags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) ==
                    VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT)
                break;
        }
        if (memoryTypeNdx == deviceMemProps.memoryTypeCount)
            TCU_THROW(NotSupportedError, "No compatible memory type found");

        // Request a separate physical allocation on every device in the group.
        const VkMemoryAllocateFlagsInfo allocDeviceMaskInfo = {
            VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO, // sType
            nullptr,                                      // pNext
            VK_MEMORY_ALLOCATE_DEVICE_MASK_BIT,           // flags
            allocDeviceMask,                              // deviceMask
        };

        VkMemoryAllocateInfo allocInfo = {
            VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, // sType
            &allocDeviceMaskInfo,                   // pNext
            memReqs.size,                           // allocationSize
            memoryTypeNdx,                          // memoryTypeIndex
        };

        sboBufferMemory = allocateMemory(vk, device, &allocInfo);
        VK_CHECK(vk.bindBufferMemory(device, *sboBuffer, sboBufferMemory.get(), 0));
    }

    // Fill the buffers with data.
    // uniformInputData keeps a host-side copy of the UBO contents so the
    // validation loop below can compute reference values.
    typedef std::vector<uint32_t> data_vector_t;
    data_vector_t uniformInputData(uniformBufSize, 0);

    {
        const Allocation &bufferAllocation = uniformBuffer.getAllocation();
        uint32_t *bufferPtr                = static_cast<uint32_t *>(bufferAllocation.getHostPtr());
        for (uint32_t i = 0; i < uniformBufSize; ++i)
            uniformInputData[i] = *bufferPtr++ = rnd.getUint32() / 10; // divide to prevent overflow in addition

        flushAlloc(vk, device, bufferAllocation);
    }

    // Create descriptor set: binding 0 = output SSBO, binding 1 = base-offset UBO.
    const Unique<VkDescriptorSetLayout> descriptorSetLayout(
        DescriptorSetLayoutBuilder()
            .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
            .addSingleBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
            .build(vk, device));

    const Unique<VkDescriptorPool> descriptorPool(
        DescriptorPoolBuilder()
            .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
            .addType(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)
            .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

    const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

    const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*sboBuffer, 0ull, bufferSizeBytes);
    const VkDescriptorBufferInfo uniformBufferDescriptorInfo =
        makeDescriptorBufferInfo(*uniformBuffer, 0ull, uniformBufferSizeBytes);

    DescriptorSetUpdateBuilder()
        .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u),
                     VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
        .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u),
                     VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, &uniformBufferDescriptorInfo)
        .update(vk, device);

    ComputePipelineWrapper pipeline(vk, device, m_computePipelineConstructionType,
                                    m_context.getBinaryCollection().get("comp"));
    pipeline.setDescriptorSetLayout(descriptorSetLayout.get());
    pipeline.buildPipeline();

    // Host writes to the UBO must be visible to shader uniform reads, and
    // shader writes to the SSBO must be visible to the later transfer read.
    const VkBufferMemoryBarrier hostUniformWriteBarrier = makeBufferMemoryBarrier(
        VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_UNIFORM_READ_BIT, *uniformBuffer, 0ull, uniformBufferSizeBytes);
    const VkBufferMemoryBarrier shaderWriteBarrier = makeBufferMemoryBarrier(
        VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT, *sboBuffer, 0ull, bufferSizeBytes);

    const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, m_queueFamilyIndex));
    const Unique<VkCommandBuffer> cmdBuffer(
        allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

    // Verify multiple device masks: iterate over every non-empty subset of devices.
    for (uint32_t physDevMask = 1; physDevMask < (1u << m_numPhysDevices); physDevMask++)
    {
        // Re-randomize the shared constant (UBO element 0) for this mask and
        // remember it for reference-value computation.
        uint32_t constantValPerLoop = 0;
        {
            const Allocation &bufferAllocation = uniformBuffer.getAllocation();
            uint32_t *bufferPtr                = static_cast<uint32_t *>(bufferAllocation.getHostPtr());
            constantValPerLoop = *bufferPtr = rnd.getUint32() / 10; // divide to prevent overflow in addition
            flushAlloc(vk, device, bufferAllocation);
        }
        beginCommandBuffer(vk, *cmdBuffer);

        pipeline.bind(*cmdBuffer);
        vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline.getPipelineLayout(), 0u, 1u,
                                 &descriptorSet.get(), 0u, nullptr);
        vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                              (VkDependencyFlags)0, 0, nullptr, 1, &hostUniformWriteBarrier, 0, nullptr);

        // Restrict subsequent commands to the devices selected by physDevMask.
        vk.cmdSetDeviceMask(*cmdBuffer, physDevMask);
        vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());

        vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
                              (VkDependencyFlags)0, 0, nullptr, 1, &shaderWriteBarrier, 0, nullptr);

        endCommandBuffer(vk, *cmdBuffer);
        // Submit with the same device mask so only the selected devices execute.
        submitCommandsAndWait(vk, device, queue, *cmdBuffer, true, physDevMask);
        m_context.resetCommandPoolForVKSC(device, *cmdPool);

        // Validate the results on all physical devices where compute shader was launched
        const VkBufferMemoryBarrier srcBufferBarrier = makeBufferMemoryBarrier(
            VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT, *sboBuffer, 0ull, bufferSizeBytes);
        const VkBufferMemoryBarrier dstBufferBarrier = makeBufferMemoryBarrier(
            VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *checkBuffer, 0ull, bufferSizeBytes);
        const VkBufferCopy copyParams = {
            (VkDeviceSize)0u, // srcOffset
            (VkDeviceSize)0u, // dstOffset
            bufferSizeBytes   // size
        };

        for (uint32_t physDevIdx = 0; physDevIdx < m_numPhysDevices; physDevIdx++)
        {
            // Skip devices that were not part of this dispatch's mask.
            if (!(1 << physDevIdx & physDevMask))
                continue;

            const uint32_t deviceMask = 1 << physDevIdx;

            // Copy this device's SSBO instance into the host-visible check buffer.
            beginCommandBuffer(vk, *cmdBuffer);
            vk.cmdSetDeviceMask(*cmdBuffer, deviceMask);
            vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
                                  (VkDependencyFlags)0, 0, nullptr, 1, &srcBufferBarrier, 0, nullptr),
            vk.cmdCopyBuffer(*cmdBuffer, *sboBuffer, *checkBuffer, 1, &copyParams);
            vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_HOST_BIT,
                                  (VkDependencyFlags)0, 0, nullptr, 1, &dstBufferBarrier, 0, nullptr);

            endCommandBuffer(vk, *cmdBuffer);
            submitCommandsAndWait(vk, device, queue, *cmdBuffer, true, deviceMask);

            const Allocation &bufferAllocation = checkBuffer.getAllocation();
            invalidateAlloc(vk, device, bufferAllocation);
            const uint32_t *bufferPtr = static_cast<uint32_t *>(bufferAllocation.getHostPtr());

            // Expected value: shared constant + this device's base offset
            // (stride 4 on the host mirrors the UBO's 16-byte array stride).
            for (uint32_t ndx = 0; ndx < m_numValues; ++ndx)
            {
                const uint32_t res = bufferPtr[ndx];
                const uint32_t ref = constantValPerLoop + uniformInputData[4 * (physDevIdx + 1)];

                if (res != ref)
                {
                    std::ostringstream msg;
                    msg << "Comparison failed on physical device " << getPhysicalDevice(physDevIdx) << " ( deviceMask "
                        << deviceMask << " ) for InOut.values[" << ndx << "]";
                    return tcu::TestStatus::fail(msg.str());
                }
            }
        }
    }

    return tcu::TestStatus::pass("Compute succeeded");
}
3349 
// Test case whose instance builds its own device with two compute-capable
// queues and records the same bit-inverting compute shader on both
// (see ConcurrentComputeInstance::iterate).
class ConcurrentCompute : public vkt::TestCase
{
public:
    ConcurrentCompute(tcu::TestContext &testCtx, const std::string &name,
                      const vk::ComputePipelineConstructionType computePipelineConstructionType);

    // Throws NotSupportedError when the chosen pipeline construction path is unavailable.
    virtual void checkSupport(Context &context) const;
    void initPrograms(SourceCollections &sourceCollections) const;
    TestInstance *createInstance(Context &context) const;

    // NOTE(review): this member is public (unlike DeviceIndexTest, which keeps
    // its copy private); consider adding a private: section for consistency.
    vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
3362 
// Instance that creates a custom device exposing two compute queues (with
// differing priorities) and submits work to both; see iterate().
class ConcurrentComputeInstance : public vkt::TestInstance
{
public:
    ConcurrentComputeInstance(Context &context,
                              const vk::ComputePipelineConstructionType computePipelineConstructionType);

    tcu::TestStatus iterate(void);

private:
    // Selected pipeline-vs-shader-object construction path for the compute pipelines.
    vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
3374 
ConcurrentCompute(tcu::TestContext & testCtx,const std::string & name,const vk::ComputePipelineConstructionType computePipelineConstructionType)3375 ConcurrentCompute::ConcurrentCompute(tcu::TestContext &testCtx, const std::string &name,
3376                                      const vk::ComputePipelineConstructionType computePipelineConstructionType)
3377     : TestCase(testCtx, name)
3378     , m_computePipelineConstructionType(computePipelineConstructionType)
3379 {
3380 }
3381 
checkSupport(Context & context) const3382 void ConcurrentCompute::checkSupport(Context &context) const
3383 {
3384     checkShaderObjectRequirements(context.getInstanceInterface(), context.getPhysicalDevice(),
3385                                   m_computePipelineConstructionType);
3386 }
3387 
initPrograms(SourceCollections & sourceCollections) const3388 void ConcurrentCompute::initPrograms(SourceCollections &sourceCollections) const
3389 {
3390     std::ostringstream src;
3391     src << "#version 310 es\n"
3392         << "layout (local_size_x = 1, local_size_y = 1, local_size_z = 1) in;\n"
3393         << "layout(binding = 0) buffer InOut {\n"
3394         << "    uint values[1024];\n"
3395         << "} sb_inout;\n"
3396         << "void main (void) {\n"
3397         << "    uvec3 size           = gl_NumWorkGroups * gl_WorkGroupSize;\n"
3398         << "    uint numValuesPerInv = uint(sb_inout.values.length()) / (size.x*size.y*size.z);\n"
3399         << "    uint groupNdx        = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + "
3400            "gl_GlobalInvocationID.x;\n"
3401         << "    uint offset          = numValuesPerInv*groupNdx;\n"
3402         << "\n"
3403         << "    for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
3404         << "        sb_inout.values[offset + ndx] = ~sb_inout.values[offset + ndx];\n"
3405         << "}\n";
3406 
3407     sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
3408 }
3409 
createInstance(Context & context) const3410 TestInstance *ConcurrentCompute::createInstance(Context &context) const
3411 {
3412     return new ConcurrentComputeInstance(context, m_computePipelineConstructionType);
3413 }
3414 
ConcurrentComputeInstance(Context & context,const vk::ComputePipelineConstructionType computePipelineConstructionType)3415 ConcurrentComputeInstance::ConcurrentComputeInstance(
3416     Context &context, const vk::ComputePipelineConstructionType computePipelineConstructionType)
3417     : TestInstance(context)
3418     , m_computePipelineConstructionType(computePipelineConstructionType)
3419 {
3420 }
3421 
iterate(void)3422 tcu::TestStatus ConcurrentComputeInstance::iterate(void)
3423 {
3424     enum
3425     {
3426         NO_MATCH_FOUND = ~((uint32_t)0),
3427         ERROR_NONE     = 0,
3428         ERROR_WAIT     = 1,
3429         ERROR_ORDER    = 2
3430     };
3431 
3432     struct Queues
3433     {
3434         VkQueue queue;
3435         uint32_t queueFamilyIndex;
3436     };
3437 
3438     // const DeviceInterface& vk = m_context.getDeviceInterface();
3439     const uint32_t numValues = 1024;
3440     const CustomInstance instance(createCustomInstanceFromContext(m_context));
3441     const InstanceDriver &instanceDriver(instance.getDriver());
3442     const VkPhysicalDevice physicalDevice =
3443         chooseDevice(instanceDriver, instance, m_context.getTestContext().getCommandLine());
3444     tcu::TestLog &log = m_context.getTestContext().getLog();
3445     vk::Move<vk::VkDevice> logicalDevice;
3446     std::vector<VkQueueFamilyProperties> queueFamilyProperties;
3447     VkDeviceCreateInfo deviceInfo;
3448     VkPhysicalDeviceFeatures2 deviceFeatures2 = initVulkanStructure();
3449     VkPhysicalDeviceFeatures deviceFeatures;
3450     const float queuePriorities[2] = {1.0f, 0.0f};
3451     VkDeviceQueueCreateInfo queueInfos[2];
3452     Queues queues[2] = {{nullptr, (uint32_t)NO_MATCH_FOUND}, {nullptr, (uint32_t)NO_MATCH_FOUND}};
3453 
3454     queueFamilyProperties = getPhysicalDeviceQueueFamilyProperties(instanceDriver, physicalDevice);
3455 
3456     for (uint32_t queueNdx = 0; queueNdx < queueFamilyProperties.size(); ++queueNdx)
3457     {
3458         if (queueFamilyProperties[queueNdx].queueFlags & VK_QUEUE_COMPUTE_BIT)
3459         {
3460             if (NO_MATCH_FOUND == queues[0].queueFamilyIndex)
3461                 queues[0].queueFamilyIndex = queueNdx;
3462 
3463             if (queues[0].queueFamilyIndex != queueNdx || queueFamilyProperties[queueNdx].queueCount > 1u)
3464             {
3465                 queues[1].queueFamilyIndex = queueNdx;
3466                 break;
3467             }
3468         }
3469     }
3470 
3471     if (queues[0].queueFamilyIndex == NO_MATCH_FOUND || queues[1].queueFamilyIndex == NO_MATCH_FOUND)
3472         TCU_THROW(NotSupportedError, "Queues couldn't be created");
3473 
3474     for (int queueNdx = 0; queueNdx < 2; ++queueNdx)
3475     {
3476         VkDeviceQueueCreateInfo queueInfo;
3477         deMemset(&queueInfo, 0, sizeof(queueInfo));
3478 
3479         queueInfo.sType            = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
3480         queueInfo.pNext            = nullptr;
3481         queueInfo.flags            = (VkDeviceQueueCreateFlags)0u;
3482         queueInfo.queueFamilyIndex = queues[queueNdx].queueFamilyIndex;
3483         queueInfo.queueCount       = (queues[0].queueFamilyIndex == queues[1].queueFamilyIndex) ? 2 : 1;
3484         queueInfo.pQueuePriorities = (queueInfo.queueCount == 2) ? queuePriorities : &queuePriorities[queueNdx];
3485 
3486         queueInfos[queueNdx] = queueInfo;
3487 
3488         if (queues[0].queueFamilyIndex == queues[1].queueFamilyIndex)
3489             break;
3490     }
3491 
3492     void *pNext = nullptr;
3493 
3494     deMemset(&deviceInfo, 0, sizeof(deviceInfo));
3495     instanceDriver.getPhysicalDeviceFeatures(physicalDevice, &deviceFeatures);
3496 
3497     deviceFeatures2.features = deviceFeatures;
3498 
3499     std::vector<const char *> deviceExtensions;
3500 
3501 #ifndef CTS_USES_VULKANSC
3502     VkPhysicalDeviceDynamicRenderingFeaturesKHR dynamicRenderingFeatures = initVulkanStructure();
3503     dynamicRenderingFeatures.dynamicRendering                            = VK_TRUE;
3504     VkPhysicalDeviceShaderObjectFeaturesEXT shaderObjectFeatures = initVulkanStructure(&dynamicRenderingFeatures);
3505     shaderObjectFeatures.shaderObject                            = VK_TRUE;
3506 
3507     if (m_computePipelineConstructionType != COMPUTE_PIPELINE_CONSTRUCTION_TYPE_PIPELINE)
3508     {
3509         deviceExtensions.push_back("VK_EXT_shader_object");
3510         deviceFeatures2.pNext = &shaderObjectFeatures;
3511         pNext                 = &deviceFeatures2;
3512     }
3513 #endif
3514 
3515 #ifdef CTS_USES_VULKANSC
3516     VkDeviceObjectReservationCreateInfo memReservationInfo =
3517         m_context.getTestContext().getCommandLine().isSubProcess() ? m_context.getResourceInterface()->getStatMax() :
3518                                                                      resetDeviceObjectReservationCreateInfo();
3519     memReservationInfo.pNext = pNext;
3520     pNext                    = &memReservationInfo;
3521 
3522     VkPhysicalDeviceVulkanSC10Features sc10Features = createDefaultSC10Features();
3523     sc10Features.pNext                              = pNext;
3524     pNext                                           = &sc10Features;
3525 
3526     VkPipelineCacheCreateInfo pcCI;
3527     std::vector<VkPipelinePoolSize> poolSizes;
3528     if (m_context.getTestContext().getCommandLine().isSubProcess())
3529     {
3530         if (m_context.getResourceInterface()->getCacheDataSize() > 0)
3531         {
3532             pcCI = {
3533                 VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO, // VkStructureType sType;
3534                 nullptr,                                      // const void* pNext;
3535                 VK_PIPELINE_CACHE_CREATE_READ_ONLY_BIT |
3536                     VK_PIPELINE_CACHE_CREATE_USE_APPLICATION_STORAGE_BIT, // VkPipelineCacheCreateFlags flags;
3537                 m_context.getResourceInterface()->getCacheDataSize(),     // uintptr_t initialDataSize;
3538                 m_context.getResourceInterface()->getCacheData()          // const void* pInitialData;
3539             };
3540             memReservationInfo.pipelineCacheCreateInfoCount = 1;
3541             memReservationInfo.pPipelineCacheCreateInfos    = &pcCI;
3542         }
3543 
3544         poolSizes = m_context.getResourceInterface()->getPipelinePoolSizes();
3545         if (!poolSizes.empty())
3546         {
3547             memReservationInfo.pipelinePoolSizeCount = uint32_t(poolSizes.size());
3548             memReservationInfo.pPipelinePoolSizes    = poolSizes.data();
3549         }
3550     }
3551 #endif // CTS_USES_VULKANSC
3552 
3553     deviceInfo.sType                   = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
3554     deviceInfo.pNext                   = pNext;
3555     deviceInfo.enabledExtensionCount   = (uint32_t)deviceExtensions.size();
3556     deviceInfo.ppEnabledExtensionNames = deviceExtensions.data();
3557     deviceInfo.enabledLayerCount       = 0u;
3558     deviceInfo.ppEnabledLayerNames     = nullptr;
3559     deviceInfo.pEnabledFeatures        = (deviceFeatures2.pNext == nullptr) ? &deviceFeatures : nullptr;
3560     deviceInfo.queueCreateInfoCount    = (queues[0].queueFamilyIndex == queues[1].queueFamilyIndex) ? 1 : 2;
3561     deviceInfo.pQueueCreateInfos       = queueInfos;
3562 
3563     logicalDevice =
3564         createCustomDevice(m_context.getTestContext().getCommandLine().isValidationEnabled(),
3565                            m_context.getPlatformInterface(), instance, instanceDriver, physicalDevice, &deviceInfo);
3566 
3567 #ifndef CTS_USES_VULKANSC
3568     de::MovePtr<vk::DeviceDriver> deviceDriver = de::MovePtr<DeviceDriver>(
3569         new DeviceDriver(m_context.getPlatformInterface(), instance, *logicalDevice, m_context.getUsedApiVersion(),
3570                          m_context.getTestContext().getCommandLine()));
3571 #else
3572     de::MovePtr<vk::DeviceDriverSC, vk::DeinitDeviceDeleter> deviceDriver =
3573         de::MovePtr<DeviceDriverSC, DeinitDeviceDeleter>(
3574             new DeviceDriverSC(m_context.getPlatformInterface(), instance, *logicalDevice,
3575                                m_context.getTestContext().getCommandLine(), m_context.getResourceInterface(),
3576                                m_context.getDeviceVulkanSC10Properties(), m_context.getDeviceProperties(),
3577                                m_context.getUsedApiVersion()),
3578             vk::DeinitDeviceDeleter(m_context.getResourceInterface().get(), *logicalDevice));
3579 #endif // CTS_USES_VULKANSC
3580     vk::DeviceInterface &vk = *deviceDriver;
3581 
3582     for (uint32_t queueReqNdx = 0; queueReqNdx < 2; ++queueReqNdx)
3583     {
3584         if (queues[0].queueFamilyIndex == queues[1].queueFamilyIndex)
3585             vk.getDeviceQueue(*logicalDevice, queues[queueReqNdx].queueFamilyIndex, queueReqNdx,
3586                               &queues[queueReqNdx].queue);
3587         else
3588             vk.getDeviceQueue(*logicalDevice, queues[queueReqNdx].queueFamilyIndex, 0u, &queues[queueReqNdx].queue);
3589     }
3590 
3591     // Create an input/output buffers
3592     const VkPhysicalDeviceMemoryProperties memoryProperties =
3593         vk::getPhysicalDeviceMemoryProperties(instanceDriver, physicalDevice);
3594 
3595     de::MovePtr<SimpleAllocator> allocator =
3596         de::MovePtr<SimpleAllocator>(new SimpleAllocator(vk, *logicalDevice, memoryProperties));
3597     const VkDeviceSize bufferSizeBytes = sizeof(uint32_t) * numValues;
3598     const BufferWithMemory buffer1(vk, *logicalDevice, *allocator,
3599                                    makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT),
3600                                    MemoryRequirement::HostVisible);
3601     const BufferWithMemory buffer2(vk, *logicalDevice, *allocator,
3602                                    makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT),
3603                                    MemoryRequirement::HostVisible);
3604 
3605     // Fill the buffers with data
3606 
3607     typedef std::vector<uint32_t> data_vector_t;
3608     data_vector_t inputData(numValues);
3609 
3610     {
3611         de::Random rnd(0x82ce7f);
3612         const Allocation &bufferAllocation1 = buffer1.getAllocation();
3613         const Allocation &bufferAllocation2 = buffer2.getAllocation();
3614         uint32_t *bufferPtr1                = static_cast<uint32_t *>(bufferAllocation1.getHostPtr());
3615         uint32_t *bufferPtr2                = static_cast<uint32_t *>(bufferAllocation2.getHostPtr());
3616 
3617         for (uint32_t i = 0; i < numValues; ++i)
3618         {
3619             uint32_t val  = rnd.getUint32();
3620             inputData[i]  = val;
3621             *bufferPtr1++ = val;
3622             *bufferPtr2++ = val;
3623         }
3624 
3625         flushAlloc(vk, *logicalDevice, bufferAllocation1);
3626         flushAlloc(vk, *logicalDevice, bufferAllocation2);
3627     }
3628 
3629     // Create descriptor sets
3630 
3631     const Unique<VkDescriptorSetLayout> descriptorSetLayout1(
3632         DescriptorSetLayoutBuilder()
3633             .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
3634             .build(vk, *logicalDevice));
3635 
3636     const Unique<VkDescriptorPool> descriptorPool1(
3637         DescriptorPoolBuilder()
3638             .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
3639             .build(vk, *logicalDevice, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
3640 
3641     const Unique<VkDescriptorSet> descriptorSet1(
3642         makeDescriptorSet(vk, *logicalDevice, *descriptorPool1, *descriptorSetLayout1));
3643 
3644     const VkDescriptorBufferInfo bufferDescriptorInfo1 = makeDescriptorBufferInfo(*buffer1, 0ull, bufferSizeBytes);
3645     DescriptorSetUpdateBuilder()
3646         .writeSingle(*descriptorSet1, DescriptorSetUpdateBuilder::Location::binding(0u),
3647                      VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo1)
3648         .update(vk, *logicalDevice);
3649 
3650     const Unique<VkDescriptorSetLayout> descriptorSetLayout2(
3651         DescriptorSetLayoutBuilder()
3652             .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
3653             .build(vk, *logicalDevice));
3654 
3655     const Unique<VkDescriptorPool> descriptorPool2(
3656         DescriptorPoolBuilder()
3657             .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
3658             .build(vk, *logicalDevice, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
3659 
3660     const Unique<VkDescriptorSet> descriptorSet2(
3661         makeDescriptorSet(vk, *logicalDevice, *descriptorPool2, *descriptorSetLayout2));
3662 
3663     const VkDescriptorBufferInfo bufferDescriptorInfo2 = makeDescriptorBufferInfo(*buffer2, 0ull, bufferSizeBytes);
3664     DescriptorSetUpdateBuilder()
3665         .writeSingle(*descriptorSet2, DescriptorSetUpdateBuilder::Location::binding(0u),
3666                      VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo2)
3667         .update(vk, *logicalDevice);
3668 
3669     // Perform the computation
3670 
3671     const Unique<VkShaderModule> shaderModule(
3672         createShaderModule(vk, *logicalDevice, m_context.getBinaryCollection().get("comp"), 0u));
3673 
3674     ComputePipelineWrapper pipeline1(vk, *logicalDevice, m_computePipelineConstructionType,
3675                                      m_context.getBinaryCollection().get("comp"));
3676     pipeline1.setDescriptorSetLayout(*descriptorSetLayout1);
3677     pipeline1.buildPipeline();
3678     const VkBufferMemoryBarrier hostWriteBarrier1 =
3679         makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *buffer1, 0ull, bufferSizeBytes);
3680     const VkBufferMemoryBarrier shaderWriteBarrier1 =
3681         makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer1, 0ull, bufferSizeBytes);
3682     const Unique<VkCommandPool> cmdPool1(makeCommandPool(vk, *logicalDevice, queues[0].queueFamilyIndex));
3683     const Unique<VkCommandBuffer> cmdBuffer1(
3684         allocateCommandBuffer(vk, *logicalDevice, *cmdPool1, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
3685 
3686     ComputePipelineWrapper pipeline2(vk, *logicalDevice, m_computePipelineConstructionType,
3687                                      m_context.getBinaryCollection().get("comp"));
3688     pipeline2.setDescriptorSetLayout(*descriptorSetLayout2);
3689     pipeline2.buildPipeline();
3690     const VkBufferMemoryBarrier hostWriteBarrier2 =
3691         makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *buffer2, 0ull, bufferSizeBytes);
3692     const VkBufferMemoryBarrier shaderWriteBarrier2 =
3693         makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer2, 0ull, bufferSizeBytes);
3694     const Unique<VkCommandPool> cmdPool2(makeCommandPool(vk, *logicalDevice, queues[1].queueFamilyIndex));
3695     const Unique<VkCommandBuffer> cmdBuffer2(
3696         allocateCommandBuffer(vk, *logicalDevice, *cmdPool2, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
3697 
3698     // Command buffer 1
3699 
3700     beginCommandBuffer(vk, *cmdBuffer1);
3701     pipeline1.bind(*cmdBuffer1);
3702     vk.cmdBindDescriptorSets(*cmdBuffer1, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline1.getPipelineLayout(), 0u, 1u,
3703                              &descriptorSet1.get(), 0u, nullptr);
3704     vk.cmdPipelineBarrier(*cmdBuffer1, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
3705                           (VkDependencyFlags)0, 0, nullptr, 1, &hostWriteBarrier1, 0, nullptr);
3706     vk.cmdDispatch(*cmdBuffer1, 1, 1, 1);
3707     vk.cmdPipelineBarrier(*cmdBuffer1, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT,
3708                           (VkDependencyFlags)0, 0, nullptr, 1, &shaderWriteBarrier1, 0, nullptr);
3709     endCommandBuffer(vk, *cmdBuffer1);
3710 
3711     // Command buffer 2
3712 
3713     beginCommandBuffer(vk, *cmdBuffer2);
3714     pipeline2.bind(*cmdBuffer2);
3715     vk.cmdBindDescriptorSets(*cmdBuffer2, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline2.getPipelineLayout(), 0u, 1u,
3716                              &descriptorSet2.get(), 0u, nullptr);
3717     vk.cmdPipelineBarrier(*cmdBuffer2, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
3718                           (VkDependencyFlags)0, 0, nullptr, 1, &hostWriteBarrier2, 0, nullptr);
3719     vk.cmdDispatch(*cmdBuffer2, 1, 1, 1);
3720     vk.cmdPipelineBarrier(*cmdBuffer2, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT,
3721                           (VkDependencyFlags)0, 0, nullptr, 1, &shaderWriteBarrier2, 0, nullptr);
3722     endCommandBuffer(vk, *cmdBuffer2);
3723 
3724     VkSubmitInfo submitInfo1 = {
3725         VK_STRUCTURE_TYPE_SUBMIT_INFO, // sType
3726         nullptr,                       // pNext
3727         0u,                            // waitSemaphoreCount
3728         nullptr,                       // pWaitSemaphores
3729         nullptr,                       // pWaitDstStageMask
3730         1u,                            // commandBufferCount
3731         &cmdBuffer1.get(),             // pCommandBuffers
3732         0u,                            // signalSemaphoreCount
3733         nullptr                        // pSignalSemaphores
3734     };
3735 
3736     VkSubmitInfo submitInfo2 = {
3737         VK_STRUCTURE_TYPE_SUBMIT_INFO, // sType
3738         nullptr,                       // pNext
3739         0u,                            // waitSemaphoreCount
3740         nullptr,                       // pWaitSemaphores
3741         nullptr,                       // pWaitDstStageMask
3742         1u,                            // commandBufferCount
3743         &cmdBuffer2.get(),             // pCommandBuffers
3744         0u,                            // signalSemaphoreCount
3745         nullptr                        // pSignalSemaphores
3746     };
3747 
3748     // Wait for completion
3749     const Unique<VkFence> fence1(createFence(vk, *logicalDevice));
3750     const Unique<VkFence> fence2(createFence(vk, *logicalDevice));
3751 
3752     VK_CHECK(vk.queueSubmit(queues[0].queue, 1u, &submitInfo1, *fence1));
3753     VK_CHECK(vk.queueSubmit(queues[1].queue, 1u, &submitInfo2, *fence2));
3754 
3755     int err = ERROR_NONE;
3756 
3757     // First wait for the low-priority queue
3758     if (VK_SUCCESS != vk.waitForFences(*logicalDevice, 1u, &fence2.get(), true, ~0ull))
3759         err = ERROR_WAIT;
3760 
3761     // If the high-priority queue hasn't finished, we have a problem.
3762     if (VK_SUCCESS != vk.getFenceStatus(*logicalDevice, fence1.get()))
3763         if (err == ERROR_NONE)
3764             err = ERROR_ORDER;
3765 
3766     // Wait for the high-priority fence so we don't get errors on teardown.
3767     vk.waitForFences(*logicalDevice, 1u, &fence1.get(), true, ~0ull);
3768 
3769     // If we fail() before waiting for all of the fences, error will come from
3770     // teardown instead of the error we want.
3771 
3772     if (err == ERROR_WAIT)
3773     {
3774         return tcu::TestStatus::fail("Failed waiting for low-priority queue fence.");
3775     }
3776 
3777     // Validate the results
3778 
3779     const Allocation &bufferAllocation1 = buffer1.getAllocation();
3780     invalidateAlloc(vk, *logicalDevice, bufferAllocation1);
3781     const uint32_t *bufferPtr1 = static_cast<uint32_t *>(bufferAllocation1.getHostPtr());
3782 
3783     const Allocation &bufferAllocation2 = buffer2.getAllocation();
3784     invalidateAlloc(vk, *logicalDevice, bufferAllocation2);
3785     const uint32_t *bufferPtr2 = static_cast<uint32_t *>(bufferAllocation2.getHostPtr());
3786 
3787     for (uint32_t ndx = 0; ndx < numValues; ++ndx)
3788     {
3789         const uint32_t res1 = bufferPtr1[ndx];
3790         const uint32_t res2 = bufferPtr2[ndx];
3791         const uint32_t inp  = inputData[ndx];
3792         const uint32_t ref  = ~inp;
3793 
3794         if (res1 != ref || res1 != res2)
3795         {
3796             std::ostringstream msg;
3797             msg << "Comparison failed for InOut.values[" << ndx << "] ref:" << ref << " res1:" << res1
3798                 << " res2:" << res2 << " inp:" << inp;
3799             return tcu::TestStatus::fail(msg.str());
3800         }
3801     }
3802 
3803     if (err == ERROR_ORDER)
3804     {
3805         log << tcu::TestLog::Message
3806             << "Note: Low-priority queue was faster than high-priority one. This is not an error, but priorities may "
3807                "be inverted."
3808             << tcu::TestLog::EndMessage;
3809     }
3810 
3811     return tcu::TestStatus::pass("Test passed");
3812 }
3813 
// Test case dispatching compute work where at least one dimension of the
// dispatch size is zero (enforced by the constructor assertion), so the first
// dispatch must launch no invocations at all.
class EmptyWorkGroupCase : public vkt::TestCase
{
public:
    EmptyWorkGroupCase(tcu::TestContext &testCtx, const std::string &name, const tcu::UVec3 &dispatchSize,
                       const vk::ComputePipelineConstructionType computePipelineConstructionType);
    virtual ~EmptyWorkGroupCase(void)
    {
    }

    virtual void checkSupport(Context &context) const override;
    TestInstance *createInstance(Context &context) const override;
    void initPrograms(vk::SourceCollections &programCollection) const override;

protected:
    // Requested dispatch size; at least one component is expected to be zero.
    const tcu::UVec3 m_dispatchSize;
    // How the compute pipeline is built (passed to ComputePipelineWrapper; also checked in checkSupport).
    vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
3831 
// Instance counterpart of EmptyWorkGroupCase; iterate() performs the actual
// dispatches and verifies the invocation counter.
class EmptyWorkGroupInstance : public vkt::TestInstance
{
public:
    EmptyWorkGroupInstance(Context &context, const tcu::UVec3 &dispatchSize,
                           const vk::ComputePipelineConstructionType computePipelineConstructionType)
        : vkt::TestInstance(context)
        , m_dispatchSize(dispatchSize)
        , m_computePipelineConstructionType(computePipelineConstructionType)
    {
    }
    virtual ~EmptyWorkGroupInstance(void)
    {
    }

    tcu::TestStatus iterate(void) override;

protected:
    // Dispatch size copied from the test case; at least one component is zero.
    const tcu::UVec3 m_dispatchSize;
    // Pipeline construction type forwarded to ComputePipelineWrapper.
    vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
3852 
EmptyWorkGroupCase(tcu::TestContext & testCtx,const std::string & name,const tcu::UVec3 & dispatchSize,const vk::ComputePipelineConstructionType computePipelineConstructionType)3853 EmptyWorkGroupCase::EmptyWorkGroupCase(tcu::TestContext &testCtx, const std::string &name,
3854                                        const tcu::UVec3 &dispatchSize,
3855                                        const vk::ComputePipelineConstructionType computePipelineConstructionType)
3856     : vkt::TestCase(testCtx, name)
3857     , m_dispatchSize(dispatchSize)
3858     , m_computePipelineConstructionType(computePipelineConstructionType)
3859 {
3860     DE_ASSERT(m_dispatchSize.x() == 0u || m_dispatchSize.y() == 0u || m_dispatchSize.z() == 0u);
3861 }
3862 
checkSupport(Context & context) const3863 void EmptyWorkGroupCase::checkSupport(Context &context) const
3864 {
3865     checkShaderObjectRequirements(context.getInstanceInterface(), context.getPhysicalDevice(),
3866                                   m_computePipelineConstructionType);
3867 }
3868 
createInstance(Context & context) const3869 TestInstance *EmptyWorkGroupCase::createInstance(Context &context) const
3870 {
3871     return new EmptyWorkGroupInstance(context, m_dispatchSize, m_computePipelineConstructionType);
3872 }
3873 
initPrograms(vk::SourceCollections & programCollection) const3874 void EmptyWorkGroupCase::initPrograms(vk::SourceCollections &programCollection) const
3875 {
3876     std::ostringstream comp;
3877     comp << "#version 450\n"
3878          << "layout (local_size_x=1, local_size_y=1, local_size_z=1) in;\n"
3879          << "layout (set=0, binding=0) buffer VerificationBlock { uint value; } verif;\n"
3880          << "void main () { atomicAdd(verif.value, 1u); }\n";
3881     programCollection.glslSources.add("comp") << glu::ComputeSource(comp.str());
3882 }
3883 
// Records the (partially empty) dispatch followed by a normal 1x1x1 dispatch.
// The first dispatch has a zero component and must execute no invocations, so
// only the second dispatch may increment the counter: the verification buffer
// must read exactly 1 afterwards.
tcu::TestStatus EmptyWorkGroupInstance::iterate(void)
{
    const auto &vkd       = m_context.getDeviceInterface();
    const auto device     = m_context.getDevice();
    auto &alloc           = m_context.getDefaultAllocator();
    const auto queueIndex = m_context.getUniversalQueueFamilyIndex();
    const auto queue      = m_context.getUniversalQueue();

    // Host-visible counter buffer, zero-initialized before submission.
    const auto verifBufferSize = static_cast<VkDeviceSize>(sizeof(uint32_t));
    const auto verifBufferInfo = makeBufferCreateInfo(verifBufferSize, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT);
    BufferWithMemory verifBuffer(vkd, device, alloc, verifBufferInfo, MemoryRequirement::HostVisible);
    auto &verifBufferAlloc = verifBuffer.getAllocation();
    void *verifBufferPtr   = verifBufferAlloc.getHostPtr();

    deMemset(verifBufferPtr, 0, static_cast<size_t>(verifBufferSize));
    flushAlloc(vkd, device, verifBufferAlloc);

    // Single storage-buffer binding matching the shader's set=0, binding=0.
    DescriptorSetLayoutBuilder layoutBuilder;
    layoutBuilder.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT);
    const auto descriptorSetLayout = layoutBuilder.build(vkd, device);

    ComputePipelineWrapper pipeline(vkd, device, m_computePipelineConstructionType,
                                    m_context.getBinaryCollection().get("comp"));
    pipeline.setDescriptorSetLayout(descriptorSetLayout.get());
    pipeline.buildPipeline();

    DescriptorPoolBuilder poolBuilder;
    poolBuilder.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
    const auto descriptorPool = poolBuilder.build(vkd, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u);
    const auto descriptorSet  = makeDescriptorSet(vkd, device, descriptorPool.get(), descriptorSetLayout.get());

    DescriptorSetUpdateBuilder updateBuilder;
    const auto verifBufferDescInfo = makeDescriptorBufferInfo(verifBuffer.get(), 0ull, verifBufferSize);
    updateBuilder.writeSingle(descriptorSet.get(), DescriptorSetUpdateBuilder::Location::binding(0u),
                              VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &verifBufferDescInfo);
    updateBuilder.update(vkd, device);

    const auto cmdPool      = makeCommandPool(vkd, device, queueIndex);
    const auto cmdBufferPtr = allocateCommandBuffer(vkd, device, cmdPool.get(), VK_COMMAND_BUFFER_LEVEL_PRIMARY);
    const auto cmdBuffer    = cmdBufferPtr.get();

    beginCommandBuffer(vkd, cmdBuffer);
    pipeline.bind(cmdBuffer);
    vkd.cmdBindDescriptorSets(cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline.getPipelineLayout(), 0u, 1u,
                              &descriptorSet.get(), 0u, nullptr);
    // Dispatch with at least one zero dimension: must produce no invocations.
    vkd.cmdDispatch(cmdBuffer, m_dispatchSize.x(), m_dispatchSize.y(), m_dispatchSize.z());

    // Order the two dispatches with a compute-to-compute memory barrier.
    const auto readWriteAccess  = (VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_SHADER_READ_BIT);
    const auto computeToCompute = makeMemoryBarrier(readWriteAccess, readWriteAccess);
    vkd.cmdPipelineBarrier(cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0U,
                           1u, &computeToCompute, 0u, nullptr, 0u, nullptr);

    // Normal dispatch: exactly one invocation, bumping the counter once.
    vkd.cmdDispatch(cmdBuffer, 1u, 1u, 1u);

    // Make shader writes visible to the host read below.
    const auto computeToHost = makeMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT);
    vkd.cmdPipelineBarrier(cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, 0u, 1u,
                           &computeToHost, 0u, nullptr, 0u, nullptr);

    endCommandBuffer(vkd, cmdBuffer);
    submitCommandsAndWait(vkd, device, queue, cmdBuffer);

    uint32_t value;
    invalidateAlloc(vkd, device, verifBufferAlloc);
    deMemcpy(&value, verifBufferPtr, sizeof(value));

    // Only the 1x1x1 dispatch may have incremented the counter.
    if (value != 1u)
    {
        std::ostringstream msg;
        msg << "Unexpected value found in buffer: " << value << " while expecting 1";
        TCU_FAIL(msg.str());
    }

    return tcu::TestStatus::pass("Pass");
}
3958 
// Test that maximizes a single axis of the workgroup local size using the
// device-reported maxComputeWorkGroupSize limit for that axis (set via
// specialization constants in the instance).
class MaxWorkGroupSizeTest : public vkt::TestCase
{
public:
    enum class Axis
    {
        X = 0,
        Y = 1,
        Z = 2
    };

    struct Params
    {
        // Which axis to maximize.
        Axis axis;
    };

    MaxWorkGroupSizeTest(tcu::TestContext &testCtx, const std::string &name, const Params &params,
                         const vk::ComputePipelineConstructionType computePipelineConstructionType);
    virtual ~MaxWorkGroupSizeTest(void)
    {
    }

    virtual void initPrograms(vk::SourceCollections &programCollection) const;
    virtual TestInstance *createInstance(Context &context) const;
    virtual void checkSupport(Context &context) const;

    // Helper to transform the axis value to an index.
    static int getIndex(Axis axis);

    // Helper returning the number of invocations according to the test parameters.
    // When devProperties is null, the physical device properties are queried on demand.
    static uint32_t getInvocations(const Params &params, const vk::InstanceInterface &vki,
                                   vk::VkPhysicalDevice physicalDevice,
                                   const vk::VkPhysicalDeviceProperties *devProperties = nullptr);

    // Helper returning the buffer size needed to this test.
    static uint32_t getSSBOSize(uint32_t invocations);

private:
    Params m_params;
    vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
4000 
// Instance counterpart of MaxWorkGroupSizeTest; iterate() builds a pipeline
// whose local size is maximized on the chosen axis and verifies every
// invocation wrote its output slot.
class MaxWorkGroupSizeInstance : public vkt::TestInstance
{
public:
    MaxWorkGroupSizeInstance(Context &context, const MaxWorkGroupSizeTest::Params &params,
                             const vk::ComputePipelineConstructionType computePipelineConstructionType);
    virtual ~MaxWorkGroupSizeInstance(void)
    {
    }

    virtual tcu::TestStatus iterate(void);

private:
    // Test parameters copied from the test case (axis to maximize).
    MaxWorkGroupSizeTest::Params m_params;
    // Pipeline construction type forwarded to ComputePipelineWrapper.
    vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
4016 
getIndex(Axis axis)4017 int MaxWorkGroupSizeTest::getIndex(Axis axis)
4018 {
4019     const int ret = static_cast<int>(axis);
4020     DE_ASSERT(ret >= static_cast<int>(Axis::X) && ret <= static_cast<int>(Axis::Z));
4021     return ret;
4022 }
4023 
getInvocations(const Params & params,const vk::InstanceInterface & vki,vk::VkPhysicalDevice physicalDevice,const vk::VkPhysicalDeviceProperties * devProperties)4024 uint32_t MaxWorkGroupSizeTest::getInvocations(const Params &params, const vk::InstanceInterface &vki,
4025                                               vk::VkPhysicalDevice physicalDevice,
4026                                               const vk::VkPhysicalDeviceProperties *devProperties)
4027 {
4028     const auto axis = getIndex(params.axis);
4029 
4030     if (devProperties)
4031         return devProperties->limits.maxComputeWorkGroupSize[axis];
4032     return vk::getPhysicalDeviceProperties(vki, physicalDevice).limits.maxComputeWorkGroupSize[axis];
4033 }
4034 
getSSBOSize(uint32_t invocations)4035 uint32_t MaxWorkGroupSizeTest::getSSBOSize(uint32_t invocations)
4036 {
4037     return invocations * static_cast<uint32_t>(sizeof(uint32_t));
4038 }
4039 
// Constructor: stores test parameters; validation happens in checkSupport().
MaxWorkGroupSizeTest::MaxWorkGroupSizeTest(tcu::TestContext &testCtx, const std::string &name, const Params &params,
                                           const vk::ComputePipelineConstructionType computePipelineConstructionType)
    : vkt::TestCase(testCtx, name)
    , m_params(params)
    , m_computePipelineConstructionType(computePipelineConstructionType)
{
}
4047 
initPrograms(vk::SourceCollections & programCollection) const4048 void MaxWorkGroupSizeTest::initPrograms(vk::SourceCollections &programCollection) const
4049 {
4050     std::ostringstream shader;
4051 
4052     // The actual local sizes will be set using spec constants when running the test instance.
4053     shader << "#version 450\n"
4054            << "\n"
4055            << "layout(constant_id=0) const int local_size_x_val = 1;\n"
4056            << "layout(constant_id=1) const int local_size_y_val = 1;\n"
4057            << "layout(constant_id=2) const int local_size_z_val = 1;\n"
4058            << "\n"
4059            << "layout(local_size_x_id=0, local_size_y_id=1, local_size_z_id=2) in;\n"
4060            << "\n"
4061            << "layout(set=0, binding=0) buffer StorageBuffer {\n"
4062            << "    uint values[];\n"
4063            << "} ssbo;\n"
4064            << "\n"
4065            << "void main() {\n"
4066            << "    ssbo.values[gl_LocalInvocationIndex] = 1u;\n"
4067            << "}\n";
4068 
4069     programCollection.glslSources.add("comp") << glu::ComputeSource(shader.str());
4070 }
4071 
createInstance(Context & context) const4072 TestInstance *MaxWorkGroupSizeTest::createInstance(Context &context) const
4073 {
4074     return new MaxWorkGroupSizeInstance(context, m_params, m_computePipelineConstructionType);
4075 }
4076 
checkSupport(Context & context) const4077 void MaxWorkGroupSizeTest::checkSupport(Context &context) const
4078 {
4079     const auto &vki           = context.getInstanceInterface();
4080     const auto physicalDevice = context.getPhysicalDevice();
4081 
4082     const auto properties  = vk::getPhysicalDeviceProperties(vki, physicalDevice);
4083     const auto invocations = getInvocations(m_params, vki, physicalDevice, &properties);
4084 
4085     if (invocations > properties.limits.maxComputeWorkGroupInvocations)
4086         TCU_FAIL("Reported workgroup size limit in the axis is greater than the global invocation limit");
4087 
4088     if (properties.limits.maxStorageBufferRange / static_cast<uint32_t>(sizeof(uint32_t)) < invocations)
4089         TCU_THROW(NotSupportedError, "Maximum supported storage buffer range too small");
4090 
4091     checkShaderObjectRequirements(vki, physicalDevice, m_computePipelineConstructionType);
4092 }
4093 
// Constructor: copies the parameters from the test case; all work is in iterate().
MaxWorkGroupSizeInstance::MaxWorkGroupSizeInstance(
    Context &context, const MaxWorkGroupSizeTest::Params &params,
    const vk::ComputePipelineConstructionType computePipelineConstructionType)
    : vkt::TestInstance(context)
    , m_params(params)
    , m_computePipelineConstructionType(computePipelineConstructionType)
{
}
4102 
// Builds a compute pipeline whose local size on the chosen axis equals the
// device's maxComputeWorkGroupSize limit (set through specialization
// constants), dispatches a single workgroup, and verifies every invocation
// wrote 1u to its slot of the output SSBO.
tcu::TestStatus MaxWorkGroupSizeInstance::iterate(void)
{
    const auto &vki           = m_context.getInstanceInterface();
    const auto &vkd           = m_context.getDeviceInterface();
    const auto physicalDevice = m_context.getPhysicalDevice();
    const auto device         = m_context.getDevice();
    auto &alloc               = m_context.getDefaultAllocator();
    const auto queueIndex     = m_context.getUniversalQueueFamilyIndex();
    const auto queue          = m_context.getUniversalQueue();
    auto &log                 = m_context.getTestContext().getLog();

    const auto axis        = MaxWorkGroupSizeTest::getIndex(m_params.axis);
    const auto invocations = MaxWorkGroupSizeTest::getInvocations(m_params, vki, physicalDevice);
    const auto ssboSize    = static_cast<vk::VkDeviceSize>(MaxWorkGroupSizeTest::getSSBOSize(invocations));

    log << tcu::TestLog::Message << "Running test with " << invocations << " invocations on axis " << axis
        << " using a storage buffer size of " << ssboSize << " bytes" << tcu::TestLog::EndMessage;

    // Main SSBO buffer.
    const auto ssboInfo = vk::makeBufferCreateInfo(ssboSize, vk::VK_BUFFER_USAGE_STORAGE_BUFFER_BIT);
    vk::BufferWithMemory ssbo(vkd, device, alloc, ssboInfo, vk::MemoryRequirement::HostVisible);

    // Descriptor set layouts.
    vk::DescriptorSetLayoutBuilder layoutBuilder;
    layoutBuilder.addSingleBinding(vk::VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, vk::VK_SHADER_STAGE_COMPUTE_BIT);
    const auto descriptorSetLayout = layoutBuilder.build(vkd, device);

    // Specialization constants: set the number of invocations in the appropriate local size id.
    const auto entrySize          = static_cast<uintptr_t>(sizeof(int32_t));
    int32_t specializationData[3] = {1, 1, 1};
    specializationData[axis]      = static_cast<int32_t>(invocations);

    // Map entries 0..2 onto the three consecutive int32 values in specializationData.
    const vk::VkSpecializationMapEntry specializationMaps[3] = {
        {
            0u,        // uint32_t constantID;
            0u,        // uint32_t offset;
            entrySize, // uintptr_t size;
        },
        {
            1u,                               // uint32_t constantID;
            static_cast<uint32_t>(entrySize), // uint32_t offset;
            entrySize,                        // uintptr_t size;
        },
        {
            2u,                                    // uint32_t constantID;
            static_cast<uint32_t>(entrySize * 2u), // uint32_t offset;
            entrySize,                             // uintptr_t size;
        },
    };

    const vk::VkSpecializationInfo specializationInfo = {
        3u,                                                 // uint32_t mapEntryCount;
        specializationMaps,                                 // const VkSpecializationMapEntry* pMapEntries;
        static_cast<uintptr_t>(sizeof(specializationData)), // uintptr_t dataSize;
        specializationData,                                 // const void* pData;
    };

    ComputePipelineWrapper testPipeline(vkd, device, m_computePipelineConstructionType,
                                        m_context.getBinaryCollection().get("comp"));
    testPipeline.setDescriptorSetLayout(descriptorSetLayout.get());
    testPipeline.setSpecializationInfo(specializationInfo);
    testPipeline.buildPipeline();

    // Create descriptor pool and set.
    vk::DescriptorPoolBuilder poolBuilder;
    poolBuilder.addType(vk::VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
    const auto descriptorPool =
        poolBuilder.build(vkd, device, vk::VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u);
    const auto descriptorSet = vk::makeDescriptorSet(vkd, device, descriptorPool.get(), descriptorSetLayout.get());

    // Update descriptor set.
    const vk::VkDescriptorBufferInfo ssboBufferInfo = {
        ssbo.get(),    // VkBuffer buffer;
        0u,            // VkDeviceSize offset;
        VK_WHOLE_SIZE, // VkDeviceSize range;
    };

    vk::DescriptorSetUpdateBuilder updateBuilder;
    updateBuilder.writeSingle(descriptorSet.get(), vk::DescriptorSetUpdateBuilder::Location::binding(0u),
                              vk::VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &ssboBufferInfo);
    updateBuilder.update(vkd, device);

    // Clear buffer.
    auto &ssboAlloc = ssbo.getAllocation();
    void *ssboPtr   = ssboAlloc.getHostPtr();
    deMemset(ssboPtr, 0, static_cast<size_t>(ssboSize));
    vk::flushAlloc(vkd, device, ssboAlloc);

    // Run pipelines.
    const auto cmdPool = vk::makeCommandPool(vkd, device, queueIndex);
    const auto cmdBUfferPtr =
        vk::allocateCommandBuffer(vkd, device, cmdPool.get(), vk::VK_COMMAND_BUFFER_LEVEL_PRIMARY);
    const auto cmdBuffer = cmdBUfferPtr.get();

    vk::beginCommandBuffer(vkd, cmdBuffer);

    // Run the main test shader.
    // Make the host-side clear visible to the compute shader writes.
    const auto hostToComputeBarrier = vk::makeBufferMemoryBarrier(
        vk::VK_ACCESS_HOST_WRITE_BIT, vk::VK_ACCESS_SHADER_WRITE_BIT, ssbo.get(), 0ull, VK_WHOLE_SIZE);
    vkd.cmdPipelineBarrier(cmdBuffer, vk::VK_PIPELINE_STAGE_HOST_BIT, vk::VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0u, 0u,
                           nullptr, 1u, &hostToComputeBarrier, 0u, nullptr);

    testPipeline.bind(cmdBuffer);
    vkd.cmdBindDescriptorSets(cmdBuffer, vk::VK_PIPELINE_BIND_POINT_COMPUTE, testPipeline.getPipelineLayout(), 0u, 1u,
                              &descriptorSet.get(), 0u, nullptr);
    // A single workgroup; its local size carries all the invocations.
    vkd.cmdDispatch(cmdBuffer, 1u, 1u, 1u);

    // Make shader writes visible to the host readback below.
    const auto computeToHostBarrier = vk::makeBufferMemoryBarrier(
        vk::VK_ACCESS_SHADER_WRITE_BIT, vk::VK_ACCESS_HOST_READ_BIT, ssbo.get(), 0ull, VK_WHOLE_SIZE);
    vkd.cmdPipelineBarrier(cmdBuffer, vk::VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, vk::VK_PIPELINE_STAGE_HOST_BIT, 0u, 0u,
                           nullptr, 1u, &computeToHostBarrier, 0u, nullptr);

    vk::endCommandBuffer(vkd, cmdBuffer);
    vk::submitCommandsAndWait(vkd, device, queue, cmdBuffer);

    // Verify buffer contents: every slot must have been set to 1u by its invocation.
    vk::invalidateAlloc(vkd, device, ssboAlloc);
    std::unique_ptr<uint32_t[]> valuesArray(new uint32_t[invocations]);
    uint32_t *valuesPtr = valuesArray.get();
    deMemcpy(valuesPtr, ssboPtr, static_cast<size_t>(ssboSize));

    std::string errorMsg;
    bool ok = true;

    for (size_t i = 0; i < invocations; ++i)
    {
        if (valuesPtr[i] != 1u)
        {
            ok       = false;
            errorMsg = "Found invalid value for invocation index " + de::toString(i) + ": expected 1u and found " +
                       de::toString(valuesPtr[i]);
            break;
        }
    }

    if (!ok)
        return tcu::TestStatus::fail(errorMsg);
    return tcu::TestStatus::pass("Pass");
}
4242 
4243 namespace EmptyShaderTest
4244 {
4245 
checkSupport(Context & context,vk::ComputePipelineConstructionType computePipelineConstructionType)4246 void checkSupport(Context &context, vk::ComputePipelineConstructionType computePipelineConstructionType)
4247 {
4248     checkShaderObjectRequirements(context.getInstanceInterface(), context.getPhysicalDevice(),
4249                                   computePipelineConstructionType);
4250 }
4251 
createProgram(SourceCollections & dst,vk::ComputePipelineConstructionType)4252 void createProgram(SourceCollections &dst, vk::ComputePipelineConstructionType)
4253 {
4254     dst.glslSources.add("comp") << glu::ComputeSource("#version 310 es\n"
4255                                                       "layout (local_size_x = 1) in;\n"
4256                                                       "void main (void) {}\n");
4257 }
4258 
createTest(Context & context,vk::ComputePipelineConstructionType computePipelineConstructionType)4259 tcu::TestStatus createTest(Context &context, vk::ComputePipelineConstructionType computePipelineConstructionType)
4260 {
4261     const DeviceInterface &vk       = context.getDeviceInterface();
4262     const VkDevice device           = context.getDevice();
4263     const VkQueue queue             = context.getUniversalQueue();
4264     const uint32_t queueFamilyIndex = context.getUniversalQueueFamilyIndex();
4265 
4266     ComputePipelineWrapper pipeline(vk, device, computePipelineConstructionType,
4267                                     context.getBinaryCollection().get("comp"));
4268     pipeline.buildPipeline();
4269 
4270     const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
4271     const Unique<VkCommandBuffer> cmdBuffer(
4272         allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
4273 
4274     // Start recording commands
4275 
4276     beginCommandBuffer(vk, *cmdBuffer);
4277 
4278     pipeline.bind(*cmdBuffer);
4279 
4280     const tcu::IVec3 workGroups(1, 1, 1);
4281     vk.cmdDispatch(*cmdBuffer, workGroups.x(), workGroups.y(), workGroups.z());
4282 
4283     endCommandBuffer(vk, *cmdBuffer);
4284 
4285     submitCommandsAndWait(vk, device, queue, *cmdBuffer);
4286 
4287     return tcu::TestStatus::pass("Compute succeeded");
4288 }
4289 
4290 } // namespace EmptyShaderTest
4291 
4292 namespace ComputeOnlyQueueTests
4293 {
4294 
getComputeOnlyQueueFamily(Context & context)4295 tcu::Maybe<uint32_t> getComputeOnlyQueueFamily(Context &context)
4296 {
4297     bool foundQueue = false;
4298     uint32_t index  = 0;
4299 
4300     auto queueFamilies =
4301         getPhysicalDeviceQueueFamilyProperties(context.getInstanceInterface(), context.getPhysicalDevice());
4302 
4303     for (const auto &queueFamily : queueFamilies)
4304     {
4305         if ((queueFamily.queueFlags & VK_QUEUE_COMPUTE_BIT) && !(queueFamily.queueFlags & VK_QUEUE_GRAPHICS_BIT))
4306         {
4307             foundQueue = true;
4308             break;
4309         }
4310         else
4311         {
4312             index++;
4313         }
4314     }
4315     if (!foundQueue)
4316     {
4317         return tcu::Maybe<uint32_t>();
4318     }
4319     else
4320     {
4321         return index;
4322     }
4323 }
4324 
4325 // Creates a device that has a queue for compute capabilities without graphics.
createComputeOnlyDevice(vk::VkInstance instance,const InstanceInterface & instanceDriver,const VkPhysicalDevice physicalDevice,Context & context,uint32_t & queueFamilyIndex)4326 Move<VkDevice> createComputeOnlyDevice(vk::VkInstance instance, const InstanceInterface &instanceDriver,
4327                                        const VkPhysicalDevice physicalDevice, Context &context,
4328                                        uint32_t &queueFamilyIndex)
4329 {
4330     const auto queueFamilies = getPhysicalDeviceQueueFamilyProperties(instanceDriver, physicalDevice);
4331 
4332     // One queue family without a graphics bit should be found, since this is checked in checkSupport.
4333     queueFamilyIndex = getComputeOnlyQueueFamily(context).get();
4334 
4335     const float queuePriority                            = 1.0f;
4336     const VkDeviceQueueCreateInfo deviceQueueCreateInfos = {
4337         VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO, // VkStructureType sType;
4338         nullptr,                                    // const void* pNext;
4339         (VkDeviceQueueCreateFlags)0u,               // VkDeviceQueueCreateFlags flags;
4340         queueFamilyIndex,                           // uint32_t queueFamilyIndex;
4341         1u,                                         // uint32_t queueCount;
4342         &queuePriority,                             // const float* pQueuePriorities;
4343     };
4344 
4345     void *pNext = nullptr;
4346 #ifdef CTS_USES_VULKANSC
4347     VkDeviceObjectReservationCreateInfo memReservationInfo = context.getTestContext().getCommandLine().isSubProcess() ?
4348                                                                  context.getResourceInterface()->getStatMax() :
4349                                                                  resetDeviceObjectReservationCreateInfo();
4350     pNext                                                  = &memReservationInfo;
4351 
4352     VkPhysicalDeviceVulkanSC10Features sc10Features = createDefaultSC10Features();
4353     sc10Features.pNext                              = pNext;
4354     pNext                                           = &sc10Features;
4355 
4356     VkPipelineCacheCreateInfo pcCI;
4357     std::vector<VkPipelinePoolSize> poolSizes;
4358     if (context.getTestContext().getCommandLine().isSubProcess())
4359     {
4360         if (context.getResourceInterface()->getCacheDataSize() > 0)
4361         {
4362             pcCI = {
4363                 VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO, // VkStructureType sType;
4364                 nullptr,                                      // const void* pNext;
4365                 VK_PIPELINE_CACHE_CREATE_READ_ONLY_BIT |
4366                     VK_PIPELINE_CACHE_CREATE_USE_APPLICATION_STORAGE_BIT, // VkPipelineCacheCreateFlags flags;
4367                 context.getResourceInterface()->getCacheDataSize(),       // uintptr_t initialDataSize;
4368                 context.getResourceInterface()->getCacheData()            // const void* pInitialData;
4369             };
4370             memReservationInfo.pipelineCacheCreateInfoCount = 1;
4371             memReservationInfo.pPipelineCacheCreateInfos    = &pcCI;
4372         }
4373         poolSizes = context.getResourceInterface()->getPipelinePoolSizes();
4374         if (!poolSizes.empty())
4375         {
4376             memReservationInfo.pipelinePoolSizeCount = uint32_t(poolSizes.size());
4377             memReservationInfo.pPipelinePoolSizes    = poolSizes.data();
4378         }
4379     }
4380 #endif // CTS_USES_VULKANSC
4381     const VkDeviceCreateInfo deviceCreateInfo = {
4382         VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO, // VkStructureType sType;
4383         pNext,                                // const void* pNext;
4384         (VkDeviceCreateFlags)0u,              // VkDeviceCreateFlags flags;
4385         1,                                    // uint32_t queueCreateInfoCount;
4386         &deviceQueueCreateInfos,              // const VkDeviceQueueCreateInfo* pQueueCreateInfos;
4387         0u,                                   // uint32_t enabledLayerCount;
4388         nullptr,                              // const char* const* ppEnabledLayerNames;
4389         0,                                    // uint32_t enabledExtensionCount;
4390         nullptr,                              // const char* const* ppEnabledExtensionNames;
4391         nullptr,                              // const VkPhysicalDeviceFeatures* pEnabledFeatures;
4392     };
4393 
4394     return vkt::createCustomDevice(context.getTestContext().getCommandLine().isValidationEnabled(),
4395                                    context.getPlatformInterface(), instance, instanceDriver, physicalDevice,
4396                                    &deviceCreateInfo);
4397 }
4398 
// Test case that records a compute dispatch into a secondary command buffer
// and executes it on a queue family supporting compute but not graphics.
class SecondaryCommandBufferComputeOnlyTest : public vkt::TestCase
{
public:
    SecondaryCommandBufferComputeOnlyTest(tcu::TestContext &context, const std::string &name)
        : vkt::TestCase(context, name){};

    // Registers the "comp" shader that writes a known value into an SSBO.
    void initPrograms(SourceCollections &programCollection) const override;
    TestInstance *createInstance(Context &context) const override;
    // Throws NotSupportedError when no compute-only queue family exists.
    void checkSupport(Context &context) const override;
};
4409 
// Instance that performs the secondary-command-buffer compute test; see iterate().
class SecondaryCommandBufferComputeOnlyTestInstance : public vkt::TestInstance
{
public:
    SecondaryCommandBufferComputeOnlyTestInstance(Context &context)
        : vkt::TestInstance(context)
#ifdef CTS_USES_VULKANSC
        , m_customInstance(createCustomInstanceFromContext(context))
#endif // CTS_USES_VULKANSC
              {};
    virtual tcu::TestStatus iterate(void);

protected:
#ifdef CTS_USES_VULKANSC
    // Vulkan SC builds create their own instance so the test can create a
    // custom compute-only device from it.
    const CustomInstance m_customInstance;
#endif // CTS_USES_VULKANSC
};
4426 
initPrograms(SourceCollections & collection) const4427 void SecondaryCommandBufferComputeOnlyTest::initPrograms(SourceCollections &collection) const
4428 {
4429     {
4430         std::ostringstream src;
4431         src << glu::getGLSLVersionDeclaration(glu::GLSL_VERSION_450) << "\n"
4432             << "layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;\n"
4433             << "layout(set = 0, binding = 0, std430) buffer Out\n"
4434             << "{\n"
4435             << "    uint data[];\n"
4436             << "};\n"
4437             << "void main (void)\n"
4438             << "{\n"
4439             << "data[0] = 1;"
4440             << "}\n";
4441         collection.glslSources.add("comp") << glu::ComputeSource(src.str());
4442     }
4443 }
4444 
createInstance(Context & context) const4445 TestInstance *SecondaryCommandBufferComputeOnlyTest::createInstance(Context &context) const
4446 {
4447     return new SecondaryCommandBufferComputeOnlyTestInstance(context);
4448 }
4449 
checkSupport(Context & context) const4450 void SecondaryCommandBufferComputeOnlyTest::checkSupport(Context &context) const
4451 {
4452     // Find at least one queue family that supports compute queue but does NOT support graphics queue.
4453     if (!getComputeOnlyQueueFamily(context))
4454         TCU_THROW(NotSupportedError, "No queue family found that only supports compute queue.");
4455 }
4456 
iterate()4457 tcu::TestStatus SecondaryCommandBufferComputeOnlyTestInstance::iterate()
4458 {
4459     VkDevice device;
4460     uint32_t queueFamilyIndex;
4461 #ifdef CTS_USES_VULKANSC
4462     const vk::InstanceInterface &vki = m_customInstance.getDriver();
4463     const VkPhysicalDevice physDevice =
4464         chooseDevice(vki, m_customInstance, m_context.getTestContext().getCommandLine());
4465     auto customDevice = createComputeOnlyDevice(m_customInstance, vki, physDevice, m_context, queueFamilyIndex);
4466     de::MovePtr<DeviceDriverSC, DeinitDeviceDeleter> deviceDriver;
4467 #else
4468     const InstanceInterface &vki      = m_context.getInstanceInterface();
4469     const VkPhysicalDevice physDevice = m_context.getPhysicalDevice();
4470     auto customDevice = createComputeOnlyDevice(m_context.getInstance(), vki, physDevice, m_context, queueFamilyIndex);
4471     de::MovePtr<DeviceDriver> deviceDriver;
4472 #endif // CTS_USES_VULKANSC
4473 
4474     device = customDevice.get();
4475 
4476 #ifndef CTS_USES_VULKANSC
4477     deviceDriver = de::MovePtr<DeviceDriver>(new DeviceDriver(m_context.getPlatformInterface(), m_context.getInstance(),
4478                                                               device, m_context.getUsedApiVersion(),
4479                                                               m_context.getTestContext().getCommandLine()));
4480 #else
4481     deviceDriver = de::MovePtr<DeviceDriverSC, DeinitDeviceDeleter>(
4482         new DeviceDriverSC(m_context.getPlatformInterface(), m_customInstance, device,
4483                            m_context.getTestContext().getCommandLine(), m_context.getResourceInterface(),
4484                            m_context.getDeviceVulkanSC10Properties(), m_context.getDeviceProperties(),
4485                            m_context.getUsedApiVersion()),
4486         DeinitDeviceDeleter(m_context.getResourceInterface().get(), device));
4487 #endif // CTS_USES_VULKANSC
4488 
4489     const DeviceInterface &vkdi = *deviceDriver;
4490 
4491     auto queue = getDeviceQueue(vkdi, device, queueFamilyIndex, 0u);
4492     auto allocator =
4493         de::MovePtr<Allocator>(new SimpleAllocator(vkdi, device, getPhysicalDeviceMemoryProperties(vki, physDevice)));
4494 
4495     const auto bufferSize = static_cast<VkDeviceSize>(sizeof(uint32_t));
4496     BufferWithMemory buffer(vkdi, device, *allocator.get(),
4497                             makeBufferCreateInfo(bufferSize, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT),
4498                             MemoryRequirement::HostVisible);
4499     auto &bufferAlloc = buffer.getAllocation();
4500     void *bufferData  = bufferAlloc.getHostPtr();
4501     deMemset(bufferData, 0, sizeof(uint32_t));
4502     flushAlloc(vkdi, device, bufferAlloc);
4503 
4504     DescriptorSetLayoutBuilder layoutBuilder;
4505     layoutBuilder.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT);
4506     Unique<VkDescriptorSetLayout> descriptorSetLayout(layoutBuilder.build(vkdi, device));
4507 
4508     DescriptorPoolBuilder poolBuilder;
4509     poolBuilder.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
4510     const auto descriptorPool = poolBuilder.build(vkdi, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1);
4511     const auto descriptorSetBuffer = makeDescriptorSet(vkdi, device, descriptorPool.get(), descriptorSetLayout.get());
4512 
4513     // Update descriptor sets.
4514     DescriptorSetUpdateBuilder updater;
4515 
4516     const auto bufferInfo = makeDescriptorBufferInfo(buffer.get(), 0ull, bufferSize);
4517     updater.writeSingle(descriptorSetBuffer.get(), DescriptorSetUpdateBuilder::Location::binding(0u),
4518                         VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferInfo);
4519 
4520     updater.update(vkdi, device);
4521 
4522     auto shader = createShaderModule(vkdi, device, m_context.getBinaryCollection().get("comp"));
4523     // Create compute pipeline
4524     const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vkdi, device, *descriptorSetLayout));
4525     const Unique<VkPipeline> computePipeline(makeComputePipeline(vkdi, device, *pipelineLayout, *shader));
4526 
4527     // Create command buffer
4528     const Unique<VkCommandPool> cmdPool(makeCommandPool(vkdi, device, queueFamilyIndex));
4529     const Unique<VkCommandBuffer> cmdBuffer(
4530         allocateCommandBuffer(vkdi, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
4531     const Unique<VkCommandBuffer> cmdBuffer2(
4532         allocateCommandBuffer(vkdi, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_SECONDARY));
4533 
4534     const VkCommandBufferInheritanceInfo bufferInheritanceInfo{
4535         VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_INFO, // VkStructureType sType;
4536         nullptr,                                           // const void* pNext;
4537         VK_NULL_HANDLE,                                    // VkRenderPass renderPass;
4538         0u,                                                // uint32_t subpass;
4539         VK_NULL_HANDLE,                                    // VkFramebuffer framebuffer;
4540         VK_FALSE,                                          // VkBool32 occlusionQueryEnable;
4541         (VkQueryControlFlags)0u,                           // VkQueryControlFlags queryFlags;
4542         (VkQueryPipelineStatisticFlags)0u                  // VkQueryPipelineStatisticFlags pipelineStatistics;
4543     };
4544 
4545     VkCommandBufferUsageFlags usageFlags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
4546     const VkCommandBufferBeginInfo commandBufBeginParams{
4547         VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, // VkStructureType sType;
4548         nullptr,                                     // const void* pNext;
4549         usageFlags,                                  // VkCommandBufferUsageFlags flags;
4550         &bufferInheritanceInfo};
4551 
4552     beginCommandBuffer(vkdi, cmdBuffer.get());
4553     vkdi.beginCommandBuffer(cmdBuffer2.get(), &commandBufBeginParams);
4554     vkdi.cmdBindPipeline(cmdBuffer2.get(), VK_PIPELINE_BIND_POINT_COMPUTE, computePipeline.get());
4555     vkdi.cmdBindDescriptorSets(cmdBuffer2.get(), VK_PIPELINE_BIND_POINT_COMPUTE, pipelineLayout.get(), 0u, 1,
4556                                &descriptorSetBuffer.get(), 0u, nullptr);
4557     vkdi.cmdDispatch(cmdBuffer2.get(), 1, 1, 1);
4558     endCommandBuffer(vkdi, cmdBuffer2.get());
4559     vkdi.cmdExecuteCommands(cmdBuffer.get(), 1, &cmdBuffer2.get());
4560     const VkBufferMemoryBarrier renderBufferBarrier =
4561         makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, buffer.get(), 0ull, bufferSize);
4562     cmdPipelineBufferMemoryBarrier(vkdi, cmdBuffer.get(), VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
4563                                    VK_PIPELINE_STAGE_HOST_BIT, &renderBufferBarrier);
4564     endCommandBuffer(vkdi, cmdBuffer.get());
4565     submitCommandsAndWait(vkdi, device, queue, cmdBuffer.get());
4566 
4567     invalidateAlloc(vkdi, device, bufferAlloc);
4568 
4569     uint32_t result = 0;
4570     deMemcpy(&result, bufferData, sizeof(uint32_t));
4571     if (result != 1)
4572     {
4573         return tcu::TestStatus::pass("value of buffer unexpected");
4574     }
4575 
4576     return tcu::TestStatus::pass("passed");
4577 }
4578 
4579 }; // namespace ComputeOnlyQueueTests
4580 
// Kinds of composite values exercised by the replicated-composites tests.
// Each enumerator selects a different output-buffer layout and shader snippet
// in ReplicatedCompositesTest::initPrograms.
enum CompositeType
{
    VECTOR,        // vec4 of floats
    MATRIX,        // mat4, written out as 16 floats
    ARRAY,         // uint[3]
    ARRAY_ARRAY,   // uint[2][3], flattened to 6 uints
    STRUCT,        // struct of three uints
    STRUCT_STRUCT, // struct containing two three-uint structs
    COOPMAT,       // cooperative matrix of float16 (subgroup scope)
};
// How the composite value is instantiated inside the test shader.
enum InstType
{
    VALUE,        // ordinary local variable initialized at runtime
    CONSTANT,     // compile-time "const" composite
    SPECCONSTANT, // composite driven by a specialization constant
};
4597 
4598 #ifndef CTS_USES_VULKANSC
4599 
// Test case for replicated composites: generates a compute shader in which a
// composite whose elements all share one value is built (see CompositeType /
// InstType) and written to an output buffer for verification.
class ReplicatedCompositesTest : public vkt::TestCase
{
public:
    ReplicatedCompositesTest(tcu::TestContext &testCtx, const std::string &name, const CompositeType compositeType,
                             const InstType instType,
                             const vk::ComputePipelineConstructionType computePipelineConstructionType);

    // Throws NotSupportedError when required features (shaderReplicatedComposites,
    // and cooperative matrix support for COOPMAT) are missing.
    virtual void checkSupport(Context &context) const;
    void initPrograms(SourceCollections &sourceCollections) const;
    TestInstance *createInstance(Context &context) const;

private:
    vk::ComputePipelineConstructionType m_computePipelineConstructionType;
    CompositeType m_compositeType; // which composite shape the shader builds
    InstType m_instType;           // how the composite is instantiated
};
4616 
// Instance that runs the replicated-composites compute dispatch and checks the
// output buffer contents; see iterate().
class ReplicatedCompositesTestInstance : public vkt::TestInstance
{
public:
    ReplicatedCompositesTestInstance(Context &context, const CompositeType compositeType, const InstType instType,
                                     const vk::ComputePipelineConstructionType computePipelineConstructionType);

    tcu::TestStatus iterate(void);

private:
    vk::ComputePipelineConstructionType m_computePipelineConstructionType;
    CompositeType m_compositeType; // which composite shape the shader builds
    InstType m_instType;           // how the composite is instantiated
};
4630 
// Stores the test parameters; all Vulkan work happens in the instance's iterate().
ReplicatedCompositesTest::ReplicatedCompositesTest(
    tcu::TestContext &testCtx, const std::string &name, const CompositeType compositeType, const InstType instType,
    const vk::ComputePipelineConstructionType computePipelineConstructionType)
    : TestCase(testCtx, name)
    , m_computePipelineConstructionType(computePipelineConstructionType)
    , m_compositeType(compositeType)
    , m_instType(instType)
{
}
4640 
checkSupport(Context & context) const4641 void ReplicatedCompositesTest::checkSupport(Context &context) const
4642 {
4643     checkShaderObjectRequirements(context.getInstanceInterface(), context.getPhysicalDevice(),
4644                                   m_computePipelineConstructionType);
4645 
4646 #ifndef CTS_USES_VULKANSC
4647     if (!context.getShaderReplicatedCompositesFeaturesEXT().shaderReplicatedComposites)
4648     {
4649         TCU_THROW(NotSupportedError, "shaderReplicatedComposites not supported");
4650     }
4651 
4652     if (m_compositeType == COOPMAT)
4653     {
4654         const InstanceInterface &vki = context.getInstanceInterface();
4655         if (!context.getCooperativeMatrixFeatures().cooperativeMatrix)
4656         {
4657             TCU_THROW(NotSupportedError,
4658                       "VkPhysicalDeviceCooperativeMatrixFeaturesKHR::cooperativeMatrix not supported");
4659         }
4660 
4661         uint32_t propertyCount = 0;
4662 
4663         VK_CHECK(
4664             vki.getPhysicalDeviceCooperativeMatrixPropertiesKHR(context.getPhysicalDevice(), &propertyCount, nullptr));
4665 
4666         const VkCooperativeMatrixPropertiesKHR initStruct = initVulkanStructureConst();
4667 
4668         std::vector<VkCooperativeMatrixPropertiesKHR> properties(propertyCount, initStruct);
4669 
4670         VK_CHECK(vki.getPhysicalDeviceCooperativeMatrixPropertiesKHR(context.getPhysicalDevice(), &propertyCount,
4671                                                                      properties.data()));
4672 
4673         bool foundFp16 = false;
4674         for (size_t i = 0; i < properties.size(); ++i)
4675         {
4676             const VkCooperativeMatrixPropertiesKHR *p = &properties[i];
4677 
4678             if (p->scope != VK_SCOPE_SUBGROUP_KHR)
4679                 continue;
4680 
4681             if (p->AType == VK_COMPONENT_TYPE_FLOAT16_KHR)
4682                 foundFp16 = true;
4683         }
4684         if (!foundFp16)
4685         {
4686             TCU_THROW(NotSupportedError, "cooperativeMatrix float16 not supported");
4687         }
4688     }
4689 #endif // CTS_USES_VULKANSC
4690 }
4691 
// Generates the compute shader for the selected composite type and
// instantiation mode. "#pragma use_replicated_composites" asks the compiler to
// emit replicated (single-value) composite construction; the shader then
// copies every element of the composite into the output SSBO so the instance
// can verify the values.
void ReplicatedCompositesTest::initPrograms(SourceCollections &sourceCollections) const
{
    std::ostringstream src;
    src << "#version 460 core\n"
        << "#extension GL_EXT_scalar_block_layout : enable\n"
        << "#extension GL_KHR_cooperative_matrix : enable\n"
        << "#extension GL_EXT_shader_explicit_arithmetic_types_float16 : enable\n"
        << "#extension GL_KHR_memory_scope_semantics : enable\n"
        << "#extension GL_EXT_spec_constant_composites : enable\n"
        << "#pragma use_replicated_composites\n"
        << "layout (local_size_x = 1, local_size_y = 1, local_size_z = 1) in;\n"
        << "layout(binding = 0, scalar) buffer Output {\n";

    // Declare the output buffer member whose layout matches the composite type.
    switch (m_compositeType)
    {
    case VECTOR:
        src << "float vec[4];\n";
        break;
    case MATRIX:
        src << "float mat[4*4];\n";
        break;
    case ARRAY:
        src << "uint arr[3];\n";
        break;
    case ARRAY_ARRAY:
        src << "uint arrarr[6];\n";
        break;
    case STRUCT:
        src << "uint str[3];\n";
        break;
    case STRUCT_STRUCT:
        src << "uint str[6];\n";
        break;
    case COOPMAT:
        src << "float mat[2];\n";
        break;
    default:
        DE_ASSERT(0);
        break;
    }
    src << "} sb_out;\n\n";

    // Cooperative matrix dimensions come in via specialization constants,
    // filled with a supported configuration by the test instance.
    if (m_compositeType == COOPMAT)
    {
        src << "layout(constant_id = 1) const uint rows = 1;\n"
            << "layout(constant_id = 2) const uint cols = 1;\n";
    }

    // For CONSTANT and SPECCONSTANT modes, declare the composite at global
    // scope; SPECCONSTANT additionally makes the seed a specialization constant.
    if (m_instType != VALUE)
    {
        if (m_instType == SPECCONSTANT)
        {
            src << "layout(constant_id = 0) ";
        }
        switch (m_compositeType)
        {
        case VECTOR:
            src << "const float one = 1.0;\n"
                << "const vec4 vec = vec4(one);\n";
            break;
        case MATRIX:
            src << "const float one = 1.0;\n"
                << "const vec4 vec = vec4(one);\n"
                << "const mat4 mat = mat4(vec, vec, vec, vec);\n";
            break;
        case ARRAY:
            src << "const uint three = 3;\n"
                << "const uint arr[3] = {three, three, three};\n";
            break;
        case ARRAY_ARRAY:
            src << "const uint three = 3;\n"
                << "const uint arr[3] = {three, three, three};\n"
                << "const uint arrarr[2][3] = {arr, arr};\n";
            break;
        case STRUCT:
            src << "const uint six = 6;\n"
                << "struct S { uint a; uint b; uint c; };\n"
                << "const S str = S(six, six, six);\n\n";
            break;
        case STRUCT_STRUCT:
            src << "const uint six = 6;\n"
                << "struct S { uint a; uint b; uint c; };\n"
                << "struct SS { S a; S b; };\n"
                << "const S str = S(six, six, six);\n"
                << "const SS str2 = SS(str, str);\n\n";
            break;
        case COOPMAT:
            src << "const float one = 1.0;\n"
                << "const coopmat<float16_t, gl_ScopeSubgroup, rows, cols, gl_MatrixUseA> mat = coopmat<float16_t, "
                   "gl_ScopeSubgroup, rows, cols, gl_MatrixUseA>(one);\n";
            break;
        default:
            DE_ASSERT(0);
            break;
        }
    }
    src << "void main (void) {\n";

    // For VALUE mode the composite is built as a local variable inside main().
    if (m_instType == VALUE)
    {
        switch (m_compositeType)
        {
        case VECTOR:
            src << "    float one = 1.0;\n"
                << "    vec4 vec = vec4(one);\n";
            break;
        case MATRIX:
            src << "    float one = 1.0;\n"
                << "    vec4 vec = vec4(one);\n"
                << "    mat4 mat = mat4(vec, vec, vec, vec);\n";
            break;
        case ARRAY:
            src << "    uint three = 3;\n"
                << "    uint arr[3] = {three, three, three};\n";
            break;
        case ARRAY_ARRAY:
            src << "    uint three = 3;\n"
                << "    uint arr[3] = {three, three, three};\n"
                << "    uint arrarr[2][3] = {arr, arr};\n";
            break;
        case STRUCT:
            src << "    uint six = 6;\n"
                << "    struct S { uint a; uint b; uint c; };\n"
                << "    S str = S(six, six, six);\n\n";
            break;
        case STRUCT_STRUCT:
            src << "    uint six = 6;\n"
                << "    struct S { uint a; uint b; uint c; };\n"
                << "    struct SS { S a; S b; };\n"
                << "    S str = S(six, six, six);\n"
                << "    SS str2 = SS(str, str);\n\n";
            break;
        case COOPMAT:
            src << "    float one = 1.0;\n"
                << "    coopmat<float16_t, gl_ScopeSubgroup, rows, cols, gl_MatrixUseA> mat = coopmat<float16_t, "
                   "gl_ScopeSubgroup, rows, cols, gl_MatrixUseA>(one);\n";
            break;
        default:
            DE_ASSERT(0);
            break;
        }
    }
    // Copy each element of the composite into the output buffer.
    switch (m_compositeType)
    {
    case VECTOR:
        src << "    sb_out.vec[0] = vec[0];\n"
            << "    sb_out.vec[1] = vec[1];\n"
            << "    sb_out.vec[2] = vec[2];\n"
            << "    sb_out.vec[3] = vec[3];\n";
        break;
    case MATRIX:
        src << "    sb_out.mat[0] = mat[0][0];\n"
            << "    sb_out.mat[1] = mat[0][1];\n"
            << "    sb_out.mat[2] = mat[0][2];\n"
            << "    sb_out.mat[3] = mat[0][3];\n"
            << "    sb_out.mat[4] = mat[1][0];\n"
            << "    sb_out.mat[5] = mat[1][1];\n"
            << "    sb_out.mat[6] = mat[1][2];\n"
            << "    sb_out.mat[7] = mat[1][3];\n"
            << "    sb_out.mat[8] = mat[2][0];\n"
            << "    sb_out.mat[9] = mat[2][1];\n"
            << "    sb_out.mat[10] = mat[2][2];\n"
            << "    sb_out.mat[11] = mat[2][3];\n"
            << "    sb_out.mat[12] = mat[3][0];\n"
            << "    sb_out.mat[13] = mat[3][1];\n"
            << "    sb_out.mat[14] = mat[3][2];\n"
            << "    sb_out.mat[15] = mat[3][3];\n";
        break;
    case ARRAY:
        src << "    sb_out.arr[0] = arr[0];\n"
            << "    sb_out.arr[1] = arr[1];\n"
            << "    sb_out.arr[2] = arr[2];\n";
        break;
    case ARRAY_ARRAY:
        src << "    sb_out.arrarr[0] = arrarr[0][0];\n"
            << "    sb_out.arrarr[1] = arrarr[0][1];\n"
            << "    sb_out.arrarr[2] = arrarr[0][2];\n"
            << "    sb_out.arrarr[3] = arrarr[1][0];\n"
            << "    sb_out.arrarr[4] = arrarr[1][1];\n"
            << "    sb_out.arrarr[5] = arrarr[1][2];\n";
        break;
    case STRUCT:
        src << "    sb_out.str[0] = str.a;\n"
            << "    sb_out.str[1] = str.b;\n"
            << "    sb_out.str[2] = str.c;\n";
        break;
    case STRUCT_STRUCT:
        src << "    sb_out.str[0] = str2.a.a;\n"
            << "    sb_out.str[1] = str2.a.b;\n"
            << "    sb_out.str[2] = str2.a.c;\n"
            << "    sb_out.str[3] = str2.b.a;\n"
            << "    sb_out.str[4] = str2.b.b;\n"
            << "    sb_out.str[5] = str2.b.c;\n";
        break;
    case COOPMAT:
        // The per-invocation cooperative matrix length is implementation
        // dependent, so only the first (guaranteed) element is read directly.
        src << "    sb_out.mat[0] = float(mat[0]);\n"
            << "    sb_out.mat[1] = (mat.length() > 1) ? float(mat[1]) : float(mat[0]);\n";
        break;
    default:
        DE_ASSERT(0);
        break;
    }
    src << "}\n";

    sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
}
4898 
createInstance(Context & context) const4899 TestInstance *ReplicatedCompositesTest::createInstance(Context &context) const
4900 {
4901     return new ReplicatedCompositesTestInstance(context, m_compositeType, m_instType,
4902                                                 m_computePipelineConstructionType);
4903 }
4904 
// Copies the test parameters for use in iterate(); no Vulkan work happens here.
ReplicatedCompositesTestInstance::ReplicatedCompositesTestInstance(
    Context &context, const CompositeType compositeType, const InstType instType,
    const vk::ComputePipelineConstructionType computePipelineConstructionType)
    : TestInstance(context)
    , m_computePipelineConstructionType(computePipelineConstructionType)
    , m_compositeType(compositeType)
    , m_instType(instType)
{
}
4914 
iterate(void)4915 tcu::TestStatus ReplicatedCompositesTestInstance::iterate(void)
4916 {
4917     const DeviceInterface &vk       = m_context.getDeviceInterface();
4918     const VkDevice device           = m_context.getDevice();
4919     const VkQueue queue             = m_context.getUniversalQueue();
4920     const uint32_t queueFamilyIndex = m_context.getUniversalQueueFamilyIndex();
4921     Allocator &allocator            = m_context.getDefaultAllocator();
4922 
4923     // Create a buffer and host-visible memory for it
4924 
4925     const VkDeviceSize bufferSizeBytes = 256;
4926     const BufferWithMemory buffer(vk, device, allocator,
4927                                   makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT),
4928                                   MemoryRequirement::HostVisible);
4929 
4930     const Allocation &bufferAllocation = buffer.getAllocation();
4931     deMemset(bufferAllocation.getHostPtr(), 0, bufferSizeBytes);
4932 
4933     flushAlloc(vk, device, bufferAllocation);
4934     // Create descriptor set
4935 
4936     const Unique<VkDescriptorSetLayout> descriptorSetLayout(
4937         DescriptorSetLayoutBuilder()
4938             .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
4939             .build(vk, device));
4940 
4941     const Unique<VkDescriptorPool> descriptorPool(
4942         DescriptorPoolBuilder()
4943             .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
4944             .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
4945 
4946     const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
4947 
4948     const VkDescriptorBufferInfo descriptorInfo = makeDescriptorBufferInfo(*buffer, 0ull, bufferSizeBytes);
4949     DescriptorSetUpdateBuilder()
4950         .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u),
4951                      VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &descriptorInfo)
4952         .update(vk, device);
4953 
4954     // Perform the computation
4955     ComputePipelineWrapper pipeline(vk, device, m_computePipelineConstructionType,
4956                                     m_context.getBinaryCollection().get("comp"));
4957     pipeline.setDescriptorSetLayout(descriptorSetLayout.get());
4958 
4959     uint32_t coopmatRows = 0, coopmatCols = 0;
4960 #ifndef CTS_USES_VULKANSC
4961     if (m_compositeType == COOPMAT)
4962     {
4963         const InstanceInterface &vki = m_context.getInstanceInterface();
4964         uint32_t propertyCount       = 0;
4965 
4966         VK_CHECK(vki.getPhysicalDeviceCooperativeMatrixPropertiesKHR(m_context.getPhysicalDevice(), &propertyCount,
4967                                                                      nullptr));
4968 
4969         const VkCooperativeMatrixPropertiesKHR initStruct = initVulkanStructureConst();
4970 
4971         std::vector<VkCooperativeMatrixPropertiesKHR> properties(propertyCount, initStruct);
4972 
4973         VK_CHECK(vki.getPhysicalDeviceCooperativeMatrixPropertiesKHR(m_context.getPhysicalDevice(), &propertyCount,
4974                                                                      properties.data()));
4975 
4976         for (size_t i = 0; i < properties.size(); ++i)
4977         {
4978             const VkCooperativeMatrixPropertiesKHR *p = &properties[i];
4979 
4980             if (p->scope != VK_SCOPE_SUBGROUP_KHR)
4981                 continue;
4982 
4983             if (p->AType == VK_COMPONENT_TYPE_FLOAT16_KHR)
4984             {
4985                 if (p->MSize * p->KSize > coopmatRows * coopmatCols)
4986                 {
4987                     coopmatRows = p->MSize;
4988                     coopmatCols = p->KSize;
4989                 }
4990             }
4991         }
4992         DE_ASSERT(coopmatRows * coopmatCols > 0);
4993     }
4994 #endif // CTS_USES_VULKANSC
4995 
4996     uint32_t specializationData[3]                           = {deFloatBitsToUint32(2.0f), coopmatRows, coopmatCols};
4997     const vk::VkSpecializationMapEntry specializationMaps[3] = {
4998         {
4999             0u,               // uint32_t constantID;
5000             0u,               // uint32_t offset;
5001             sizeof(uint32_t), // uintptr_t size;
5002         },
5003         {
5004             1u,               // uint32_t constantID;
5005             4u,               // uint32_t offset;
5006             sizeof(uint32_t), // uintptr_t size;
5007         },
5008         {
5009             2u,               // uint32_t constantID;
5010             8u,               // uint32_t offset;
5011             sizeof(uint32_t), // uintptr_t size;
5012         },
5013     };
5014     const vk::VkSpecializationInfo specializationInfo = {
5015         3u,                                                 // uint32_t mapEntryCount;
5016         specializationMaps,                                 // const VkSpecializationMapEntry* pMapEntries;
5017         static_cast<uintptr_t>(sizeof(specializationData)), // uintptr_t dataSize;
5018         specializationData,                                 // const void* pData;
5019     };
5020     pipeline.setSpecializationInfo(specializationInfo);
5021     pipeline.buildPipeline();
5022 
5023     const VkBufferMemoryBarrier computeFinishBarrier =
5024         makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer, 0ull, bufferSizeBytes);
5025 
5026     const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
5027     const Unique<VkCommandBuffer> cmdBuffer(
5028         allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
5029 
5030     // Start recording commands
5031 
5032     beginCommandBuffer(vk, *cmdBuffer);
5033 
5034     pipeline.bind(*cmdBuffer);
5035     vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline.getPipelineLayout(), 0u, 1u,
5036                              &descriptorSet.get(), 0u, nullptr);
5037 
5038     vk.cmdDispatch(*cmdBuffer, 1, 1, 1);
5039 
5040     vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT,
5041                           (VkDependencyFlags)0, 0, nullptr, 1, &computeFinishBarrier, 0, nullptr);
5042 
5043     endCommandBuffer(vk, *cmdBuffer);
5044 
5045     // Wait for completion
5046 
5047     submitCommandsAndWait(vk, device, queue, *cmdBuffer);
5048 
5049     // Validate the results
5050 
5051     invalidateAlloc(vk, device, bufferAllocation);
5052 
5053     const void *outStruct = bufferAllocation.getHostPtr();
5054 
5055     {
5056         float vecelem       = m_instType == SPECCONSTANT ? 2.0f : 1.0f;
5057         float vecRef[4]     = {vecelem, vecelem, vecelem, vecelem};
5058         float matRef[4 * 4] = {vecelem, vecelem, vecelem, vecelem, vecelem, vecelem, vecelem, vecelem,
5059                                vecelem, vecelem, vecelem, vecelem, vecelem, vecelem, vecelem, vecelem};
5060         float coopmatRef[2] = {vecelem, vecelem};
5061 
5062         uint32_t arrElem      = m_instType == SPECCONSTANT ? deFloatBitsToUint32(2.0f) : 3;
5063         uint32_t arrRef[3]    = {arrElem, arrElem, arrElem};
5064         uint32_t arrarrRef[6] = {arrElem, arrElem, arrElem, arrElem, arrElem, arrElem};
5065 
5066         uint32_t strElem      = m_instType == SPECCONSTANT ? deFloatBitsToUint32(2.0f) : 6;
5067         uint32_t strRef[3]    = {strElem, strElem, strElem};
5068         uint32_t strstrRef[6] = {strElem, strElem, strElem, strElem, strElem, strElem};
5069 
5070         const void *ref  = nullptr;
5071         size_t sizeofref = 0;
5072 
5073         switch (m_compositeType)
5074         {
5075         case VECTOR:
5076             ref       = vecRef;
5077             sizeofref = sizeof(vecRef);
5078             break;
5079         case MATRIX:
5080             ref       = matRef;
5081             sizeofref = sizeof(matRef);
5082             break;
5083         case ARRAY:
5084             ref       = arrRef;
5085             sizeofref = sizeof(arrRef);
5086             break;
5087         case ARRAY_ARRAY:
5088             ref       = arrarrRef;
5089             sizeofref = sizeof(arrarrRef);
5090             break;
5091         case STRUCT:
5092             ref       = strRef;
5093             sizeofref = sizeof(strRef);
5094             break;
5095         case STRUCT_STRUCT:
5096             ref       = strstrRef;
5097             sizeofref = sizeof(strstrRef);
5098             break;
5099         case COOPMAT:
5100             ref       = coopmatRef;
5101             sizeofref = sizeof(coopmatRef);
5102             break;
5103         default:
5104             DE_ASSERT(0);
5105             break;
5106         }
5107         DE_ASSERT(sizeofref <= bufferSizeBytes);
5108 
5109         if (deMemCmp(outStruct, ref, sizeofref) != 0)
5110         {
5111             return tcu::TestStatus::fail("Comparison failed");
5112         }
5113     }
5114     return tcu::TestStatus::pass("Compute succeeded");
5115 }
5116 #endif // ifndef CTS_USES_VULKANSC
5117 
5118 } // namespace
5119 
createBasicComputeShaderTests(tcu::TestContext & testCtx,vk::ComputePipelineConstructionType computePipelineConstructionType)5120 tcu::TestCaseGroup *createBasicComputeShaderTests(tcu::TestContext &testCtx,
5121                                                   vk::ComputePipelineConstructionType computePipelineConstructionType)
5122 {
5123     // Basic compute tests
5124     de::MovePtr<tcu::TestCaseGroup> basicComputeTests(new tcu::TestCaseGroup(testCtx, "basic"));
5125 
5126     // Shader that does nothing
5127     addFunctionCaseWithPrograms(basicComputeTests.get(), "empty_shader", EmptyShaderTest::checkSupport,
5128                                 EmptyShaderTest::createProgram, EmptyShaderTest::createTest,
5129                                 computePipelineConstructionType);
5130 
5131     // Concurrent compute test
5132     basicComputeTests->addChild(new ConcurrentCompute(testCtx, "concurrent_compute", computePipelineConstructionType));
5133 
5134     // Use an empty workgroup with size 0 on the X axis
5135     basicComputeTests->addChild(
5136         new EmptyWorkGroupCase(testCtx, "empty_workgroup_x", tcu::UVec3(0u, 2u, 3u), computePipelineConstructionType));
5137     // Use an empty workgroup with size 0 on the Y axis
5138     basicComputeTests->addChild(
5139         new EmptyWorkGroupCase(testCtx, "empty_workgroup_y", tcu::UVec3(2u, 0u, 3u), computePipelineConstructionType));
5140     // Use an empty workgroup with size 0 on the Z axis
5141     basicComputeTests->addChild(
5142         new EmptyWorkGroupCase(testCtx, "empty_workgroup_z", tcu::UVec3(2u, 3u, 0u), computePipelineConstructionType));
5143     // Use an empty workgroup with size 0 on the X, Y and Z axes
5144     basicComputeTests->addChild(new EmptyWorkGroupCase(testCtx, "empty_workgroup_all", tcu::UVec3(0u, 0u, 0u),
5145                                                        computePipelineConstructionType));
5146 
5147     // Use the maximum work group size on the X axis
5148     basicComputeTests->addChild(new MaxWorkGroupSizeTest(testCtx, "max_local_size_x",
5149                                                          MaxWorkGroupSizeTest::Params{MaxWorkGroupSizeTest::Axis::X},
5150                                                          computePipelineConstructionType));
5151     // Use the maximum work group size on the Y axis
5152     basicComputeTests->addChild(new MaxWorkGroupSizeTest(testCtx, "max_local_size_y",
5153                                                          MaxWorkGroupSizeTest::Params{MaxWorkGroupSizeTest::Axis::Y},
5154                                                          computePipelineConstructionType));
5155     // Use the maximum work group size on the Z axis
5156     basicComputeTests->addChild(new MaxWorkGroupSizeTest(testCtx, "max_local_size_z",
5157                                                          MaxWorkGroupSizeTest::Params{MaxWorkGroupSizeTest::Axis::Z},
5158                                                          computePipelineConstructionType));
5159 
5160     // Concurrent compute test
5161     basicComputeTests->addChild(BufferToBufferInvertTest::UBOToSSBOInvertCase(
5162         testCtx, "ubo_to_ssbo_single_invocation", 256, tcu::IVec3(1, 1, 1), tcu::IVec3(1, 1, 1),
5163         computePipelineConstructionType));
5164     basicComputeTests->addChild(BufferToBufferInvertTest::UBOToSSBOInvertCase(testCtx, "ubo_to_ssbo_single_group", 1024,
5165                                                                               tcu::IVec3(2, 1, 4), tcu::IVec3(1, 1, 1),
5166                                                                               computePipelineConstructionType));
5167     basicComputeTests->addChild(BufferToBufferInvertTest::UBOToSSBOInvertCase(
5168         testCtx, "ubo_to_ssbo_multiple_invocations", 1024, tcu::IVec3(1, 1, 1), tcu::IVec3(2, 4, 1),
5169         computePipelineConstructionType));
5170     basicComputeTests->addChild(
5171         BufferToBufferInvertTest::UBOToSSBOInvertCase(testCtx, "ubo_to_ssbo_multiple_groups", 1024, tcu::IVec3(1, 4, 2),
5172                                                       tcu::IVec3(2, 2, 4), computePipelineConstructionType));
5173 
5174     // Concurrent compute test
5175     basicComputeTests->addChild(
5176         BufferToBufferInvertTest::CopyInvertSSBOCase(testCtx, "copy_ssbo_single_invocation", 256, tcu::IVec3(1, 1, 1),
5177                                                      tcu::IVec3(1, 1, 1), computePipelineConstructionType));
5178     basicComputeTests->addChild(BufferToBufferInvertTest::CopyInvertSSBOCase(
5179         testCtx, "copy_ssbo_multiple_invocations", 1024, tcu::IVec3(1, 1, 1), tcu::IVec3(2, 4, 1),
5180         computePipelineConstructionType));
5181     basicComputeTests->addChild(BufferToBufferInvertTest::CopyInvertSSBOCase(testCtx, "copy_ssbo_multiple_groups", 1024,
5182                                                                              tcu::IVec3(1, 4, 2), tcu::IVec3(2, 2, 4),
5183                                                                              computePipelineConstructionType));
5184 
5185     // Read and write same SSBO
5186     basicComputeTests->addChild(new InvertSSBOInPlaceTest(testCtx, "ssbo_rw_single_invocation", 256, true,
5187                                                           tcu::IVec3(1, 1, 1), tcu::IVec3(1, 1, 1),
5188                                                           computePipelineConstructionType));
5189     basicComputeTests->addChild(new InvertSSBOInPlaceTest(testCtx, "ssbo_rw_multiple_groups", 1024, true,
5190                                                           tcu::IVec3(1, 4, 2), tcu::IVec3(2, 2, 4),
5191                                                           computePipelineConstructionType));
5192     basicComputeTests->addChild(new InvertSSBOInPlaceTest(testCtx, "ssbo_unsized_arr_single_invocation", 256, false,
5193                                                           tcu::IVec3(1, 1, 1), tcu::IVec3(1, 1, 1),
5194                                                           computePipelineConstructionType));
5195     basicComputeTests->addChild(new InvertSSBOInPlaceTest(testCtx, "ssbo_unsized_arr_multiple_groups", 1024, false,
5196                                                           tcu::IVec3(1, 4, 2), tcu::IVec3(2, 2, 4),
5197                                                           computePipelineConstructionType));
5198 
5199     // Write to multiple SSBOs
5200     basicComputeTests->addChild(new WriteToMultipleSSBOTest(testCtx, "write_multiple_arr_single_invocation", 256, true,
5201                                                             tcu::IVec3(1, 1, 1), tcu::IVec3(1, 1, 1),
5202                                                             computePipelineConstructionType));
5203     basicComputeTests->addChild(new WriteToMultipleSSBOTest(testCtx, "write_multiple_arr_multiple_groups", 1024, true,
5204                                                             tcu::IVec3(1, 4, 2), tcu::IVec3(2, 2, 4),
5205                                                             computePipelineConstructionType));
5206     basicComputeTests->addChild(new WriteToMultipleSSBOTest(testCtx, "write_multiple_unsized_arr_single_invocation",
5207                                                             256, false, tcu::IVec3(1, 1, 1), tcu::IVec3(1, 1, 1),
5208                                                             computePipelineConstructionType));
5209     basicComputeTests->addChild(new WriteToMultipleSSBOTest(testCtx, "write_multiple_unsized_arr_multiple_groups", 1024,
5210                                                             false, tcu::IVec3(1, 4, 2), tcu::IVec3(2, 2, 4),
5211                                                             computePipelineConstructionType));
5212 
5213     // SSBO local barrier usage
5214     basicComputeTests->addChild(new SSBOLocalBarrierTest(testCtx, "ssbo_local_barrier_single_invocation",
5215                                                          tcu::IVec3(1, 1, 1), tcu::IVec3(1, 1, 1),
5216                                                          computePipelineConstructionType));
5217     basicComputeTests->addChild(new SSBOLocalBarrierTest(testCtx, "ssbo_local_barrier_single_group",
5218                                                          tcu::IVec3(3, 2, 5), tcu::IVec3(1, 1, 1),
5219                                                          computePipelineConstructionType));
5220     basicComputeTests->addChild(new SSBOLocalBarrierTest(testCtx, "ssbo_local_barrier_multiple_groups",
5221                                                          tcu::IVec3(3, 4, 1), tcu::IVec3(2, 7, 3),
5222                                                          computePipelineConstructionType));
5223 
5224     // SSBO memory barrier usage
5225     basicComputeTests->addChild(
5226         new SSBOBarrierTest(testCtx, "ssbo_cmd_barrier_single", tcu::IVec3(1, 1, 1), computePipelineConstructionType));
5227     basicComputeTests->addChild(new SSBOBarrierTest(testCtx, "ssbo_cmd_barrier_multiple", tcu::IVec3(11, 5, 7),
5228                                                     computePipelineConstructionType));
5229 
5230     // Basic shared variable usage
5231     basicComputeTests->addChild(new SharedVarTest(testCtx, "shared_var_single_invocation", tcu::IVec3(1, 1, 1),
5232                                                   tcu::IVec3(1, 1, 1), computePipelineConstructionType));
5233     basicComputeTests->addChild(new SharedVarTest(testCtx, "shared_var_single_group", tcu::IVec3(3, 2, 5),
5234                                                   tcu::IVec3(1, 1, 1), computePipelineConstructionType));
5235     basicComputeTests->addChild(new SharedVarTest(testCtx, "shared_var_multiple_invocations", tcu::IVec3(1, 1, 1),
5236                                                   tcu::IVec3(2, 5, 4), computePipelineConstructionType));
5237     basicComputeTests->addChild(new SharedVarTest(testCtx, "shared_var_multiple_groups", tcu::IVec3(3, 4, 1),
5238                                                   tcu::IVec3(2, 7, 3), computePipelineConstructionType));
5239 
5240     // Atomic operation with shared var
5241     basicComputeTests->addChild(new SharedVarAtomicOpTest(testCtx, "shared_atomic_op_single_invocation",
5242                                                           tcu::IVec3(1, 1, 1), tcu::IVec3(1, 1, 1),
5243                                                           computePipelineConstructionType));
5244     basicComputeTests->addChild(new SharedVarAtomicOpTest(testCtx, "shared_atomic_op_single_group", tcu::IVec3(3, 2, 5),
5245                                                           tcu::IVec3(1, 1, 1), computePipelineConstructionType));
5246     basicComputeTests->addChild(new SharedVarAtomicOpTest(testCtx, "shared_atomic_op_multiple_invocations",
5247                                                           tcu::IVec3(1, 1, 1), tcu::IVec3(2, 5, 4),
5248                                                           computePipelineConstructionType));
5249     basicComputeTests->addChild(new SharedVarAtomicOpTest(testCtx, "shared_atomic_op_multiple_groups",
5250                                                           tcu::IVec3(3, 4, 1), tcu::IVec3(2, 7, 3),
5251                                                           computePipelineConstructionType));
5252 
5253     // Image to SSBO copy
5254     basicComputeTests->addChild(new CopyImageToSSBOTest(testCtx, "copy_image_to_ssbo_small", tcu::IVec2(1, 1),
5255                                                         tcu::IVec2(64, 64), computePipelineConstructionType));
5256     basicComputeTests->addChild(new CopyImageToSSBOTest(testCtx, "copy_image_to_ssbo_large", tcu::IVec2(2, 4),
5257                                                         tcu::IVec2(512, 512), computePipelineConstructionType));
5258 
5259     // SSBO to image copy
5260     basicComputeTests->addChild(new CopySSBOToImageTest(testCtx, "copy_ssbo_to_image_small", tcu::IVec2(1, 1),
5261                                                         tcu::IVec2(64, 64), computePipelineConstructionType));
5262     basicComputeTests->addChild(new CopySSBOToImageTest(testCtx, "copy_ssbo_to_image_large", tcu::IVec2(2, 4),
5263                                                         tcu::IVec2(512, 512), computePipelineConstructionType));
5264 
5265     // Atomic operation with image
5266     basicComputeTests->addChild(new ImageAtomicOpTest(testCtx, "image_atomic_op_local_size_1", 1, tcu::IVec2(64, 64),
5267                                                       computePipelineConstructionType));
5268     basicComputeTests->addChild(new ImageAtomicOpTest(testCtx, "image_atomic_op_local_size_8", 8, tcu::IVec2(64, 64),
5269                                                       computePipelineConstructionType));
5270 
5271     // Image barrier
5272     basicComputeTests->addChild(
5273         new ImageBarrierTest(testCtx, "image_barrier_single", tcu::IVec2(1, 1), computePipelineConstructionType));
5274     basicComputeTests->addChild(
5275         new ImageBarrierTest(testCtx, "image_barrier_multiple", tcu::IVec2(64, 64), computePipelineConstructionType));
5276 
5277     // Test secondary command buffers in compute only queues
5278     basicComputeTests->addChild(
5279         new ComputeOnlyQueueTests::SecondaryCommandBufferComputeOnlyTest(testCtx, "secondary_compute_only_queue"));
5280 
5281 #ifndef CTS_USES_VULKANSC
5282     for (uint32_t i = 0; i < 3; ++i)
5283     {
5284         const char *instStr[3] = {"value", "constant", "specconstant"};
5285         std::string name;
5286         name = std::string("replicated_composites_vector_") + instStr[i];
5287         basicComputeTests->addChild(
5288             new ReplicatedCompositesTest(testCtx, name.c_str(), VECTOR, (InstType)i, computePipelineConstructionType));
5289         name = std::string("replicated_composites_matrix_") + instStr[i];
5290         basicComputeTests->addChild(
5291             new ReplicatedCompositesTest(testCtx, name.c_str(), MATRIX, (InstType)i, computePipelineConstructionType));
5292         name = std::string("replicated_composites_array_") + instStr[i];
5293         basicComputeTests->addChild(
5294             new ReplicatedCompositesTest(testCtx, name.c_str(), ARRAY, (InstType)i, computePipelineConstructionType));
5295         name = std::string("replicated_composites_array_array_") + instStr[i];
5296         basicComputeTests->addChild(new ReplicatedCompositesTest(testCtx, name.c_str(), ARRAY_ARRAY, (InstType)i,
5297                                                                  computePipelineConstructionType));
5298         name = std::string("replicated_composites_struct_") + instStr[i];
5299         basicComputeTests->addChild(
5300             new ReplicatedCompositesTest(testCtx, name.c_str(), STRUCT, (InstType)i, computePipelineConstructionType));
5301         name = std::string("replicated_composites_struct_struct_") + instStr[i];
5302         basicComputeTests->addChild(new ReplicatedCompositesTest(testCtx, name.c_str(), STRUCT_STRUCT, (InstType)i,
5303                                                                  computePipelineConstructionType));
5304         name = std::string("replicated_composites_coopmat_") + instStr[i];
5305         basicComputeTests->addChild(
5306             new ReplicatedCompositesTest(testCtx, name.c_str(), COOPMAT, (InstType)i, computePipelineConstructionType));
5307     }
5308 
5309     if (!isComputePipelineConstructionTypeShaderObject(computePipelineConstructionType))
5310     {
5311         basicComputeTests->addChild(
5312             cts_amber::createAmberTestCase(testCtx, "write_ssbo_array", "", "compute", "write_ssbo_array.amber"));
5313         basicComputeTests->addChild(cts_amber::createAmberTestCase(testCtx, "atomic_barrier_sum_small", "", "compute",
5314                                                                    "atomic_barrier_sum_small.amber"));
5315         basicComputeTests->addChild(
5316             cts_amber::createAmberTestCase(testCtx, "branch_past_barrier", "", "compute", "branch_past_barrier.amber"));
5317         basicComputeTests->addChild(cts_amber::createAmberTestCase(
5318             testCtx, "webgl_spirv_loop",
5319             "Simple SPIR-V loop from a WebGL example that caused problems in some implementations", "compute",
5320             "webgl_spirv_loop.amber"));
5321 
5322         {
5323             cts_amber::AmberTestCase *testCase = cts_amber::createAmberTestCase(
5324                 testCtx, "pk_immediate", "Immediate/inline arguments to packed 16-bit operations", "compute",
5325                 "pk-immediate.amber");
5326             testCase->addRequirement("Storage16BitFeatures.storageBuffer16BitAccess");
5327             testCase->addRequirement("Float16Int8Features.shaderFloat16");
5328             testCase->addRequirement("Features.shaderInt16");
5329             testCase->addPropertyRequirement("FloatControlsProperties.shaderDenormPreserveFloat16");
5330             basicComputeTests->addChild(testCase);
5331         }
5332 
5333         {
5334             cts_amber::AmberTestCase *testCase = cts_amber::createAmberTestCase(
5335                 testCtx, "pkadd_immediate", "Immediate/inline arguments to packed 16-bit operations", "compute",
5336                 "pkadd-immediate.amber");
5337             testCase->addRequirement("Features.shaderInt16");
5338             testCase->addRequirement("Storage16BitFeatures.storageBuffer16BitAccess");
5339             basicComputeTests->addChild(testCase);
5340         }
5341     }
5342 #endif // ifndef CTS_USES_VULKANSC
5343 
5344     return basicComputeTests.release();
5345 }
5346 
createBasicDeviceGroupComputeShaderTests(tcu::TestContext & testCtx,vk::ComputePipelineConstructionType computePipelineConstructionType)5347 tcu::TestCaseGroup *createBasicDeviceGroupComputeShaderTests(
5348     tcu::TestContext &testCtx, vk::ComputePipelineConstructionType computePipelineConstructionType)
5349 {
5350     de::MovePtr<tcu::TestCaseGroup> deviceGroupComputeTests(new tcu::TestCaseGroup(testCtx, "device_group"));
5351 
5352     deviceGroupComputeTests->addChild(new DispatchBaseTest(testCtx, "dispatch_base", 32768, tcu::IVec3(4, 2, 4),
5353                                                            tcu::IVec3(16, 8, 8), tcu::IVec3(4, 8, 8),
5354                                                            computePipelineConstructionType, false));
5355 #ifndef CTS_USES_VULKANSC
5356     deviceGroupComputeTests->addChild(new DispatchBaseTest(testCtx, "dispatch_base_maintenance5", 32768,
5357                                                            tcu::IVec3(4, 2, 4), tcu::IVec3(16, 8, 8),
5358                                                            tcu::IVec3(4, 8, 8), computePipelineConstructionType, true));
5359 #endif
5360     deviceGroupComputeTests->addChild(new DeviceIndexTest(testCtx, "device_index", 96, tcu::IVec3(3, 2, 1),
5361                                                           tcu::IVec3(2, 4, 1), computePipelineConstructionType));
5362 
5363     return deviceGroupComputeTests.release();
5364 }
5365 } // namespace compute
5366 } // namespace vkt
5367