1 /*------------------------------------------------------------------------
2 * Vulkan Conformance Tests
3 * ------------------------
4 *
5 * Copyright (c) 2019 The Khronos Group Inc.
6 * Copyright (c) 2019 The Android Open Source Project
7 *
8 * Licensed under the Apache License, Version 2.0 (the "License");
9 * you may not use this file except in compliance with the License.
10 * You may obtain a copy of the License at
11 *
12 * http://www.apache.org/licenses/LICENSE-2.0
13 *
14 * Unless required by applicable law or agreed to in writing, software
15 * distributed under the License is distributed on an "AS IS" BASIS,
16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 * See the License for the specific language governing permissions and
18 * limitations under the License.
19 *
20 *//*!
21 * \file
22 * \brief Compute Shader Tests
23 *//*--------------------------------------------------------------------*/
24
25 #include "vktComputeBasicComputeShaderTests.hpp"
26 #include "vktTestCase.hpp"
27 #include "vktTestCaseUtil.hpp"
28 #include "vktComputeTestsUtil.hpp"
29 #include "vktCustomInstancesDevices.hpp"
30 #include "vktAmberTestCase.hpp"
31
32 #include "vkDefs.hpp"
33 #include "vkRef.hpp"
34 #include "vkRefUtil.hpp"
35 #include "vkPlatform.hpp"
36 #include "vkPrograms.hpp"
37 #include "vkRefUtil.hpp"
38 #include "vkMemUtil.hpp"
39 #include "vkBarrierUtil.hpp"
40 #include "vkQueryUtil.hpp"
41 #include "vkBuilderUtil.hpp"
42 #include "vkTypeUtil.hpp"
43 #include "vkDeviceUtil.hpp"
44 #include "vkCmdUtil.hpp"
45 #include "vkObjUtil.hpp"
46 #include "vkBufferWithMemory.hpp"
47 #include "vkSafetyCriticalUtil.hpp"
48 #include "vkImageWithMemory.hpp"
49
50 #include "tcuCommandLine.hpp"
51 #include "tcuTestLog.hpp"
52
53 #include "deStringUtil.hpp"
54 #include "deUniquePtr.hpp"
55 #include "deRandom.hpp"
56
57 #include <vector>
58 #include <memory>
59
60 using namespace vk;
61
62 namespace vkt
63 {
64 namespace compute
65 {
66 namespace
67 {
68
69 template<typename T, int size>
multiplyComponents(const tcu::Vector<T,size> & v)70 T multiplyComponents (const tcu::Vector<T, size>& v)
71 {
72 T accum = 1;
73 for (int i = 0; i < size; ++i)
74 accum *= v[i];
75 return accum;
76 }
77
// Returns a*a; used when computing CPU reference values for shader output.
template<typename T>
inline T squared (const T& a)
{
	const T product = a * a;
	return product;
}
83
make2DImageCreateInfo(const tcu::IVec2 & imageSize,const VkImageUsageFlags usage)84 inline VkImageCreateInfo make2DImageCreateInfo (const tcu::IVec2& imageSize, const VkImageUsageFlags usage)
85 {
86 const VkImageCreateInfo imageParams =
87 {
88 VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, // VkStructureType sType;
89 DE_NULL, // const void* pNext;
90 0u, // VkImageCreateFlags flags;
91 VK_IMAGE_TYPE_2D, // VkImageType imageType;
92 VK_FORMAT_R32_UINT, // VkFormat format;
93 vk::makeExtent3D(imageSize.x(), imageSize.y(), 1), // VkExtent3D extent;
94 1u, // deUint32 mipLevels;
95 1u, // deUint32 arrayLayers;
96 VK_SAMPLE_COUNT_1_BIT, // VkSampleCountFlagBits samples;
97 VK_IMAGE_TILING_OPTIMAL, // VkImageTiling tiling;
98 usage, // VkImageUsageFlags usage;
99 VK_SHARING_MODE_EXCLUSIVE, // VkSharingMode sharingMode;
100 0u, // deUint32 queueFamilyIndexCount;
101 DE_NULL, // const deUint32* pQueueFamilyIndices;
102 VK_IMAGE_LAYOUT_UNDEFINED, // VkImageLayout initialLayout;
103 };
104 return imageParams;
105 }
106
makeBufferImageCopy(const tcu::IVec2 & imageSize)107 inline VkBufferImageCopy makeBufferImageCopy(const tcu::IVec2& imageSize)
108 {
109 return compute::makeBufferImageCopy(vk::makeExtent3D(imageSize.x(), imageSize.y(), 1), 1u);
110 }
111
// Selects whether a test sources its data from a uniform buffer or a storage
// buffer. (Not referenced in this part of the file — presumably used by test
// cases defined further below; verify against the rest of the file.)
enum BufferType
{
	BUFFER_TYPE_UNIFORM,
	BUFFER_TYPE_SSBO,
};
117
118 class SharedVarTest : public vkt::TestCase
119 {
120 public:
121 SharedVarTest (tcu::TestContext& testCtx,
122 const std::string& name,
123 const std::string& description,
124 const tcu::IVec3& localSize,
125 const tcu::IVec3& workSize);
126
127 void initPrograms (SourceCollections& sourceCollections) const;
128 TestInstance* createInstance (Context& context) const;
129
130 private:
131 const tcu::IVec3 m_localSize;
132 const tcu::IVec3 m_workSize;
133 };
134
135 class SharedVarTestInstance : public vkt::TestInstance
136 {
137 public:
138 SharedVarTestInstance (Context& context,
139 const tcu::IVec3& localSize,
140 const tcu::IVec3& workSize);
141
142 tcu::TestStatus iterate (void);
143
144 private:
145 const tcu::IVec3 m_localSize;
146 const tcu::IVec3 m_workSize;
147 };
148
// Stores the local and work sizes for initPrograms() and the instance.
// Members are const, so they must be set in the initializer list.
SharedVarTest::SharedVarTest (tcu::TestContext& testCtx,
							  const std::string& name,
							  const std::string& description,
							  const tcu::IVec3& localSize,
							  const tcu::IVec3& workSize)
	: TestCase (testCtx, name, description)
	, m_localSize (localSize)
	, m_workSize (workSize)
{
}
159
initPrograms(SourceCollections & sourceCollections) const160 void SharedVarTest::initPrograms (SourceCollections& sourceCollections) const
161 {
162 const int workGroupSize = multiplyComponents(m_localSize);
163 const int workGroupCount = multiplyComponents(m_workSize);
164 const int numValues = workGroupSize * workGroupCount;
165
166 std::ostringstream src;
167 src << "#version 310 es\n"
168 << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
169 << "layout(binding = 0) writeonly buffer Output {\n"
170 << " uint values[" << numValues << "];\n"
171 << "} sb_out;\n\n"
172 << "shared uint offsets[" << workGroupSize << "];\n\n"
173 << "void main (void) {\n"
174 << " uint localSize = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_WorkGroupSize.z;\n"
175 << " uint globalNdx = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
176 << " uint globalOffs = localSize*globalNdx;\n"
177 << " uint localOffs = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_LocalInvocationID.z + gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x;\n"
178 << "\n"
179 << " offsets[localSize-localOffs-1u] = globalOffs + localOffs*localOffs;\n"
180 << " memoryBarrierShared();\n"
181 << " barrier();\n"
182 << " sb_out.values[globalOffs + localOffs] = offsets[localOffs];\n"
183 << "}\n";
184
185 sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
186 }
187
createInstance(Context & context) const188 TestInstance* SharedVarTest::createInstance (Context& context) const
189 {
190 return new SharedVarTestInstance(context, m_localSize, m_workSize);
191 }
192
// Captures the dispatch parameters; const members require init-list setup.
SharedVarTestInstance::SharedVarTestInstance (Context& context, const tcu::IVec3& localSize, const tcu::IVec3& workSize)
	: TestInstance (context)
	, m_localSize (localSize)
	, m_workSize (workSize)
{
}
199
// Records and submits a single compute dispatch, then verifies on the host
// that each SSBO element holds the value the "mirrored" invocation wrote
// into shared memory (see SharedVarTest::initPrograms).
tcu::TestStatus SharedVarTestInstance::iterate (void)
{
	const DeviceInterface& vk = m_context.getDeviceInterface();
	const VkDevice device = m_context.getDevice();
	const VkQueue queue = m_context.getUniversalQueue();
	const deUint32 queueFamilyIndex = m_context.getUniversalQueueFamilyIndex();
	Allocator& allocator = m_context.getDefaultAllocator();

	const int workGroupSize = multiplyComponents(m_localSize);
	const int workGroupCount = multiplyComponents(m_workSize);

	// Create a buffer and host-visible memory for it

	const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * workGroupSize * workGroupCount;
	const BufferWithMemory buffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);

	// Create descriptor set: a single storage buffer at binding 0

	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
		DescriptorSetLayoutBuilder()
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.build(vk, device));

	const Unique<VkDescriptorPool> descriptorPool(
		DescriptorPoolBuilder()
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

	const VkDescriptorBufferInfo descriptorInfo = makeDescriptorBufferInfo(*buffer, 0ull, bufferSizeBytes);
	DescriptorSetUpdateBuilder()
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &descriptorInfo)
		.update(vk, device);

	// Perform the computation

	const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
	const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
	const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));

	// Makes the shader's SSBO writes visible to host reads after the dispatch.
	const VkBufferMemoryBarrier computeFinishBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer, 0ull, bufferSizeBytes);

	const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
	const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

	// Start recording commands

	beginCommandBuffer(vk, *cmdBuffer);

	vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
	vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);

	vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());

	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &computeFinishBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

	endCommandBuffer(vk, *cmdBuffer);

	// Wait for completion

	submitCommandsAndWait(vk, device, queue, *cmdBuffer);

	// Validate the results

	const Allocation& bufferAllocation = buffer.getAllocation();
	invalidateAlloc(vk, device, bufferAllocation);

	const deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());

	for (int groupNdx = 0; groupNdx < workGroupCount; ++groupNdx)
	{
		const int globalOffset = groupNdx * workGroupSize;
		for (int localOffset = 0; localOffset < workGroupSize; ++localOffset)
		{
			const deUint32 res = bufferPtr[globalOffset + localOffset];
			// Shared slot localOffset was written by invocation
			// (workGroupSize - localOffset - 1) with value globalOffs + localOffs^2,
			// hence the squared term below.
			const deUint32 ref = globalOffset + squared(workGroupSize - localOffset - 1);

			if (res != ref)
			{
				std::ostringstream msg;
				msg << "Comparison failed for Output.values[" << (globalOffset + localOffset) << "]";
				return tcu::TestStatus::fail(msg.str());
			}
		}
	}
	return tcu::TestStatus::pass("Compute succeeded");
}
288
289 class SharedVarAtomicOpTest : public vkt::TestCase
290 {
291 public:
292 SharedVarAtomicOpTest (tcu::TestContext& testCtx,
293 const std::string& name,
294 const std::string& description,
295 const tcu::IVec3& localSize,
296 const tcu::IVec3& workSize);
297
298 void initPrograms (SourceCollections& sourceCollections) const;
299 TestInstance* createInstance (Context& context) const;
300
301 private:
302 const tcu::IVec3 m_localSize;
303 const tcu::IVec3 m_workSize;
304 };
305
306 class SharedVarAtomicOpTestInstance : public vkt::TestInstance
307 {
308 public:
309 SharedVarAtomicOpTestInstance (Context& context,
310 const tcu::IVec3& localSize,
311 const tcu::IVec3& workSize);
312
313 tcu::TestStatus iterate (void);
314
315 private:
316 const tcu::IVec3 m_localSize;
317 const tcu::IVec3 m_workSize;
318 };
319
// Stores the local and work sizes; const members require init-list setup.
SharedVarAtomicOpTest::SharedVarAtomicOpTest (tcu::TestContext& testCtx,
											  const std::string& name,
											  const std::string& description,
											  const tcu::IVec3& localSize,
											  const tcu::IVec3& workSize)
	: TestCase (testCtx, name, description)
	, m_localSize (localSize)
	, m_workSize (workSize)
{
}
330
initPrograms(SourceCollections & sourceCollections) const331 void SharedVarAtomicOpTest::initPrograms (SourceCollections& sourceCollections) const
332 {
333 const int workGroupSize = multiplyComponents(m_localSize);
334 const int workGroupCount = multiplyComponents(m_workSize);
335 const int numValues = workGroupSize * workGroupCount;
336
337 std::ostringstream src;
338 src << "#version 310 es\n"
339 << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
340 << "layout(binding = 0) writeonly buffer Output {\n"
341 << " uint values[" << numValues << "];\n"
342 << "} sb_out;\n\n"
343 << "shared uint count;\n\n"
344 << "void main (void) {\n"
345 << " uint localSize = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_WorkGroupSize.z;\n"
346 << " uint globalNdx = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
347 << " uint globalOffs = localSize*globalNdx;\n"
348 << "\n"
349 << " count = 0u;\n"
350 << " memoryBarrierShared();\n"
351 << " barrier();\n"
352 << " uint oldVal = atomicAdd(count, 1u);\n"
353 << " sb_out.values[globalOffs+oldVal] = oldVal+1u;\n"
354 << "}\n";
355
356 sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
357 }
358
createInstance(Context & context) const359 TestInstance* SharedVarAtomicOpTest::createInstance (Context& context) const
360 {
361 return new SharedVarAtomicOpTestInstance(context, m_localSize, m_workSize);
362 }
363
// Captures the dispatch parameters; const members require init-list setup.
SharedVarAtomicOpTestInstance::SharedVarAtomicOpTestInstance (Context& context, const tcu::IVec3& localSize, const tcu::IVec3& workSize)
	: TestInstance (context)
	, m_localSize (localSize)
	, m_workSize (workSize)
{
}
370
// Records and submits a single compute dispatch, then verifies on the host
// that every slot of each workgroup's output region was claimed exactly once
// by the shared-counter atomicAdd (see SharedVarAtomicOpTest::initPrograms).
tcu::TestStatus SharedVarAtomicOpTestInstance::iterate (void)
{
	const DeviceInterface& vk = m_context.getDeviceInterface();
	const VkDevice device = m_context.getDevice();
	const VkQueue queue = m_context.getUniversalQueue();
	const deUint32 queueFamilyIndex = m_context.getUniversalQueueFamilyIndex();
	Allocator& allocator = m_context.getDefaultAllocator();

	const int workGroupSize = multiplyComponents(m_localSize);
	const int workGroupCount = multiplyComponents(m_workSize);

	// Create a buffer and host-visible memory for it

	const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * workGroupSize * workGroupCount;
	const BufferWithMemory buffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);

	// Create descriptor set: a single storage buffer at binding 0

	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
		DescriptorSetLayoutBuilder()
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.build(vk, device));

	const Unique<VkDescriptorPool> descriptorPool(
		DescriptorPoolBuilder()
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

	const VkDescriptorBufferInfo descriptorInfo = makeDescriptorBufferInfo(*buffer, 0ull, bufferSizeBytes);
	DescriptorSetUpdateBuilder()
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &descriptorInfo)
		.update(vk, device);

	// Perform the computation

	const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
	const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
	const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));

	// Makes the shader's SSBO writes visible to host reads after the dispatch.
	const VkBufferMemoryBarrier computeFinishBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer, 0ull, bufferSizeBytes);

	const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
	const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

	// Start recording commands

	beginCommandBuffer(vk, *cmdBuffer);

	vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
	vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);

	vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());

	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1u, &computeFinishBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

	endCommandBuffer(vk, *cmdBuffer);

	// Wait for completion

	submitCommandsAndWait(vk, device, queue, *cmdBuffer);

	// Validate the results

	const Allocation& bufferAllocation = buffer.getAllocation();
	invalidateAlloc(vk, device, bufferAllocation);

	const deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());

	for (int groupNdx = 0; groupNdx < workGroupCount; ++groupNdx)
	{
		const int globalOffset = groupNdx * workGroupSize;
		for (int localOffset = 0; localOffset < workGroupSize; ++localOffset)
		{
			const deUint32 res = bufferPtr[globalOffset + localOffset];
			// The invocation that got oldVal == localOffset wrote oldVal+1
			// into this slot, regardless of atomic ordering.
			const deUint32 ref = localOffset + 1;

			if (res != ref)
			{
				std::ostringstream msg;
				msg << "Comparison failed for Output.values[" << (globalOffset + localOffset) << "]";
				return tcu::TestStatus::fail(msg.str());
			}
		}
	}
	return tcu::TestStatus::pass("Compute succeeded");
}
459
460 class SSBOLocalBarrierTest : public vkt::TestCase
461 {
462 public:
463 SSBOLocalBarrierTest (tcu::TestContext& testCtx,
464 const std::string& name,
465 const std::string& description,
466 const tcu::IVec3& localSize,
467 const tcu::IVec3& workSize);
468
469 void initPrograms (SourceCollections& sourceCollections) const;
470 TestInstance* createInstance (Context& context) const;
471
472 private:
473 const tcu::IVec3 m_localSize;
474 const tcu::IVec3 m_workSize;
475 };
476
477 class SSBOLocalBarrierTestInstance : public vkt::TestInstance
478 {
479 public:
480 SSBOLocalBarrierTestInstance (Context& context,
481 const tcu::IVec3& localSize,
482 const tcu::IVec3& workSize);
483
484 tcu::TestStatus iterate (void);
485
486 private:
487 const tcu::IVec3 m_localSize;
488 const tcu::IVec3 m_workSize;
489 };
490
// Stores the local and work sizes; const members require init-list setup.
SSBOLocalBarrierTest::SSBOLocalBarrierTest (tcu::TestContext& testCtx,
											const std::string& name,
											const std::string& description,
											const tcu::IVec3& localSize,
											const tcu::IVec3& workSize)
	: TestCase (testCtx, name, description)
	, m_localSize (localSize)
	, m_workSize (workSize)
{
}
501
initPrograms(SourceCollections & sourceCollections) const502 void SSBOLocalBarrierTest::initPrograms (SourceCollections& sourceCollections) const
503 {
504 const int workGroupSize = multiplyComponents(m_localSize);
505 const int workGroupCount = multiplyComponents(m_workSize);
506 const int numValues = workGroupSize * workGroupCount;
507
508 std::ostringstream src;
509 src << "#version 310 es\n"
510 << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
511 << "layout(binding = 0) coherent buffer Output {\n"
512 << " uint values[" << numValues << "];\n"
513 << "} sb_out;\n\n"
514 << "void main (void) {\n"
515 << " uint localSize = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_WorkGroupSize.z;\n"
516 << " uint globalNdx = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
517 << " uint globalOffs = localSize*globalNdx;\n"
518 << " uint localOffs = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_LocalInvocationID.z + gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x;\n"
519 << "\n"
520 << " sb_out.values[globalOffs + localOffs] = globalOffs;\n"
521 << " memoryBarrierBuffer();\n"
522 << " barrier();\n"
523 << " sb_out.values[globalOffs + ((localOffs+1u)%localSize)] += localOffs;\n" // += so we read and write
524 << " memoryBarrierBuffer();\n"
525 << " barrier();\n"
526 << " sb_out.values[globalOffs + ((localOffs+2u)%localSize)] += localOffs;\n"
527 << "}\n";
528
529 sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
530 }
531
createInstance(Context & context) const532 TestInstance* SSBOLocalBarrierTest::createInstance (Context& context) const
533 {
534 return new SSBOLocalBarrierTestInstance(context, m_localSize, m_workSize);
535 }
536
// Captures the dispatch parameters; const members require init-list setup.
SSBOLocalBarrierTestInstance::SSBOLocalBarrierTestInstance (Context& context, const tcu::IVec3& localSize, const tcu::IVec3& workSize)
	: TestInstance (context)
	, m_localSize (localSize)
	, m_workSize (workSize)
{
}
543
// Records and submits a single compute dispatch, then verifies on the host
// that each SSBO element equals globalOffs plus the contributions added by
// the two neighbouring invocations (see SSBOLocalBarrierTest::initPrograms).
tcu::TestStatus SSBOLocalBarrierTestInstance::iterate (void)
{
	const DeviceInterface& vk = m_context.getDeviceInterface();
	const VkDevice device = m_context.getDevice();
	const VkQueue queue = m_context.getUniversalQueue();
	const deUint32 queueFamilyIndex = m_context.getUniversalQueueFamilyIndex();
	Allocator& allocator = m_context.getDefaultAllocator();

	const int workGroupSize = multiplyComponents(m_localSize);
	const int workGroupCount = multiplyComponents(m_workSize);

	// Create a buffer and host-visible memory for it

	const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * workGroupSize * workGroupCount;
	const BufferWithMemory buffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);

	// Create descriptor set: a single storage buffer at binding 0

	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
		DescriptorSetLayoutBuilder()
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.build(vk, device));

	const Unique<VkDescriptorPool> descriptorPool(
		DescriptorPoolBuilder()
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

	const VkDescriptorBufferInfo descriptorInfo = makeDescriptorBufferInfo(*buffer, 0ull, bufferSizeBytes);
	DescriptorSetUpdateBuilder()
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &descriptorInfo)
		.update(vk, device);

	// Perform the computation

	const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
	const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
	const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));

	// Makes the shader's SSBO writes visible to host reads after the dispatch.
	const VkBufferMemoryBarrier computeFinishBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer, 0ull, bufferSizeBytes);

	const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
	const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

	// Start recording commands

	beginCommandBuffer(vk, *cmdBuffer);

	vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
	vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);

	vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());

	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &computeFinishBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

	endCommandBuffer(vk, *cmdBuffer);

	// Wait for completion

	submitCommandsAndWait(vk, device, queue, *cmdBuffer);

	// Validate the results

	const Allocation& bufferAllocation = buffer.getAllocation();
	invalidateAlloc(vk, device, bufferAllocation);

	const deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());

	for (int groupNdx = 0; groupNdx < workGroupCount; ++groupNdx)
	{
		const int globalOffset = groupNdx * workGroupSize;
		for (int localOffset = 0; localOffset < workGroupSize; ++localOffset)
		{
			const deUint32 res = bufferPtr[globalOffset + localOffset];
			// Slot localOffset was incremented by the invocations at
			// (localOffset-1) mod workGroupSize and (localOffset-2) mod
			// workGroupSize, on top of the base value globalOffset.
			const int offs0 = localOffset - 1 < 0 ? ((localOffset + workGroupSize - 1) % workGroupSize) : ((localOffset - 1) % workGroupSize);
			const int offs1 = localOffset - 2 < 0 ? ((localOffset + workGroupSize - 2) % workGroupSize) : ((localOffset - 2) % workGroupSize);
			const deUint32 ref = static_cast<deUint32>(globalOffset + offs0 + offs1);

			if (res != ref)
			{
				std::ostringstream msg;
				msg << "Comparison failed for Output.values[" << (globalOffset + localOffset) << "]";
				return tcu::TestStatus::fail(msg.str());
			}
		}
	}
	return tcu::TestStatus::pass("Compute succeeded");
}
634
635 class CopyImageToSSBOTest : public vkt::TestCase
636 {
637 public:
638 CopyImageToSSBOTest (tcu::TestContext& testCtx,
639 const std::string& name,
640 const std::string& description,
641 const tcu::IVec2& localSize,
642 const tcu::IVec2& imageSize);
643
644 void initPrograms (SourceCollections& sourceCollections) const;
645 TestInstance* createInstance (Context& context) const;
646
647 private:
648 const tcu::IVec2 m_localSize;
649 const tcu::IVec2 m_imageSize;
650 };
651
652 class CopyImageToSSBOTestInstance : public vkt::TestInstance
653 {
654 public:
655 CopyImageToSSBOTestInstance (Context& context,
656 const tcu::IVec2& localSize,
657 const tcu::IVec2& imageSize);
658
659 tcu::TestStatus iterate (void);
660
661 private:
662 const tcu::IVec2 m_localSize;
663 const tcu::IVec2 m_imageSize;
664 };
665
// Stores the local size and image size; const members require init-list
// setup. The image must be evenly divisible into workgroups so the dispatch
// covers every texel exactly once.
CopyImageToSSBOTest::CopyImageToSSBOTest (tcu::TestContext& testCtx,
										  const std::string& name,
										  const std::string& description,
										  const tcu::IVec2& localSize,
										  const tcu::IVec2& imageSize)
	: TestCase (testCtx, name, description)
	, m_localSize (localSize)
	, m_imageSize (imageSize)
{
	DE_ASSERT(m_imageSize.x() % m_localSize.x() == 0);
	DE_ASSERT(m_imageSize.y() % m_localSize.y() == 0);
}
678
initPrograms(SourceCollections & sourceCollections) const679 void CopyImageToSSBOTest::initPrograms (SourceCollections& sourceCollections) const
680 {
681 std::ostringstream src;
682 src << "#version 310 es\n"
683 << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ") in;\n"
684 << "layout(binding = 1, r32ui) readonly uniform highp uimage2D u_srcImg;\n"
685 << "layout(binding = 0) writeonly buffer Output {\n"
686 << " uint values[" << (m_imageSize.x() * m_imageSize.y()) << "];\n"
687 << "} sb_out;\n\n"
688 << "void main (void) {\n"
689 << " uint stride = gl_NumWorkGroups.x*gl_WorkGroupSize.x;\n"
690 << " uint value = imageLoad(u_srcImg, ivec2(gl_GlobalInvocationID.xy)).x;\n"
691 << " sb_out.values[gl_GlobalInvocationID.y*stride + gl_GlobalInvocationID.x] = value;\n"
692 << "}\n";
693
694 sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
695 }
696
createInstance(Context & context) const697 TestInstance* CopyImageToSSBOTest::createInstance (Context& context) const
698 {
699 return new CopyImageToSSBOTestInstance(context, m_localSize, m_imageSize);
700 }
701
// Captures the dispatch parameters; const members require init-list setup.
CopyImageToSSBOTestInstance::CopyImageToSSBOTestInstance (Context& context, const tcu::IVec2& localSize, const tcu::IVec2& imageSize)
	: TestInstance (context)
	, m_localSize (localSize)
	, m_imageSize (imageSize)
{
}
708
iterate(void)709 tcu::TestStatus CopyImageToSSBOTestInstance::iterate (void)
710 {
711 const DeviceInterface& vk = m_context.getDeviceInterface();
712 const VkDevice device = m_context.getDevice();
713 const VkQueue queue = m_context.getUniversalQueue();
714 const deUint32 queueFamilyIndex = m_context.getUniversalQueueFamilyIndex();
715 Allocator& allocator = m_context.getDefaultAllocator();
716
717 // Create an image
718
719 const VkImageCreateInfo imageParams = make2DImageCreateInfo(m_imageSize, VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_STORAGE_BIT);
720 const ImageWithMemory image(vk, device, allocator, imageParams, MemoryRequirement::Any);
721
722 const VkImageSubresourceRange subresourceRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
723 const Unique<VkImageView> imageView(makeImageView(vk, device, *image, VK_IMAGE_VIEW_TYPE_2D, VK_FORMAT_R32_UINT, subresourceRange));
724
725 // Staging buffer (source data for image)
726
727 const deUint32 imageArea = multiplyComponents(m_imageSize);
728 const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * imageArea;
729
730 const BufferWithMemory stagingBuffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_TRANSFER_SRC_BIT), MemoryRequirement::HostVisible);
731
732 // Populate the staging buffer with test data
733 {
734 de::Random rnd(0xab2c7);
735 const Allocation& stagingBufferAllocation = stagingBuffer.getAllocation();
736 deUint32* bufferPtr = static_cast<deUint32*>(stagingBufferAllocation.getHostPtr());
737 for (deUint32 i = 0; i < imageArea; ++i)
738 *bufferPtr++ = rnd.getUint32();
739
740 flushAlloc(vk, device, stagingBufferAllocation);
741 }
742
743 // Create a buffer to store shader output
744
745 const BufferWithMemory outputBuffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
746
747 // Create descriptor set
748
749 const Unique<VkDescriptorSetLayout> descriptorSetLayout(
750 DescriptorSetLayoutBuilder()
751 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
752 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_SHADER_STAGE_COMPUTE_BIT)
753 .build(vk, device));
754
755 const Unique<VkDescriptorPool> descriptorPool(
756 DescriptorPoolBuilder()
757 .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
758 .addType(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE)
759 .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
760
761 const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
762
763 // Set the bindings
764
765 const VkDescriptorImageInfo imageDescriptorInfo = makeDescriptorImageInfo(DE_NULL, *imageView, VK_IMAGE_LAYOUT_GENERAL);
766 const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*outputBuffer, 0ull, bufferSizeBytes);
767
768 DescriptorSetUpdateBuilder()
769 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
770 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &imageDescriptorInfo)
771 .update(vk, device);
772
773 // Perform the computation
774 {
775 const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
776 const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
777 const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));
778
779 const VkBufferMemoryBarrier computeFinishBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *outputBuffer, 0ull, bufferSizeBytes);
780 const tcu::IVec2 workSize = m_imageSize / m_localSize;
781
782 // Prepare the command buffer
783
784 const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
785 const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
786
787 // Start recording commands
788
789 beginCommandBuffer(vk, *cmdBuffer);
790
791 vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
792 vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
793
794 const std::vector<VkBufferImageCopy> bufferImageCopy(1, makeBufferImageCopy(m_imageSize));
795 copyBufferToImage(vk, *cmdBuffer, *stagingBuffer, bufferSizeBytes, bufferImageCopy, VK_IMAGE_ASPECT_COLOR_BIT, 1, 1, *image, VK_IMAGE_LAYOUT_GENERAL, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT);
796
797 vk.cmdDispatch(*cmdBuffer, workSize.x(), workSize.y(), 1u);
798 vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &computeFinishBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
799
800 endCommandBuffer(vk, *cmdBuffer);
801
802 // Wait for completion
803
804 submitCommandsAndWait(vk, device, queue, *cmdBuffer);
805 }
806
807 // Validate the results
808
809 const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
810 invalidateAlloc(vk, device, outputBufferAllocation);
811
812 const deUint32* bufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
813 const deUint32* refBufferPtr = static_cast<deUint32*>(stagingBuffer.getAllocation().getHostPtr());
814
815 for (deUint32 ndx = 0; ndx < imageArea; ++ndx)
816 {
817 const deUint32 res = *(bufferPtr + ndx);
818 const deUint32 ref = *(refBufferPtr + ndx);
819
820 if (res != ref)
821 {
822 std::ostringstream msg;
823 msg << "Comparison failed for Output.values[" << ndx << "]";
824 return tcu::TestStatus::fail(msg.str());
825 }
826 }
827 return tcu::TestStatus::pass("Compute succeeded");
828 }
829
// Test case: a compute shader reads random data from an SSBO and imageStores
// it into an R32_UINT storage image; the instance verifies the image matches.
class CopySSBOToImageTest : public vkt::TestCase
{
public:
	// localSize must evenly divide imageSize in both dimensions
	// (asserted in the constructor) so one dispatch covers every pixel.
	CopySSBOToImageTest (tcu::TestContext& testCtx,
	                     const std::string& name,
	                     const std::string& description,
	                     const tcu::IVec2& localSize,
	                     const tcu::IVec2& imageSize);

	// Registers the "comp" GLSL compute shader (binding 0 = input SSBO,
	// binding 1 = destination uimage2D).
	void initPrograms (SourceCollections& sourceCollections) const;
	TestInstance* createInstance (Context& context) const;

private:
	const tcu::IVec2 m_localSize;	// workgroup (local) size, xy
	const tcu::IVec2 m_imageSize;	// full image extent, xy
};
846
// Instance that executes the SSBO-to-image copy and validates the result.
class CopySSBOToImageTestInstance : public vkt::TestInstance
{
public:
	CopySSBOToImageTestInstance (Context& context,
	                             const tcu::IVec2& localSize,
	                             const tcu::IVec2& imageSize);

	// Runs the dispatch, reads the image back and compares against the input.
	tcu::TestStatus iterate (void);

private:
	const tcu::IVec2 m_localSize;	// workgroup (local) size, xy
	const tcu::IVec2 m_imageSize;	// full image extent, xy
};
860
// Stores the test parameters. The asserts guarantee the image extent is an
// exact multiple of the workgroup size, so workSize = imageSize / localSize
// in iterate() covers every pixel exactly once.
CopySSBOToImageTest::CopySSBOToImageTest (tcu::TestContext& testCtx,
                                          const std::string& name,
                                          const std::string& description,
                                          const tcu::IVec2& localSize,
                                          const tcu::IVec2& imageSize)
	: TestCase (testCtx, name, description)
	, m_localSize (localSize)
	, m_imageSize (imageSize)
{
	DE_ASSERT(m_imageSize.x() % m_localSize.x() == 0);
	DE_ASSERT(m_imageSize.y() % m_localSize.y() == 0);
}
873
// Builds the GLSL ES 3.10 compute shader: each invocation loads one uint from
// the input SSBO (binding 0) at a row-major linear index and imageStores it at
// its own xy coordinate in the destination image (binding 1).
void CopySSBOToImageTest::initPrograms (SourceCollections& sourceCollections) const
{
	std::ostringstream src;
	src << "#version 310 es\n"
		<< "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ") in;\n"
		<< "layout(binding = 1, r32ui) writeonly uniform highp uimage2D u_dstImg;\n"
		<< "layout(binding = 0) readonly buffer Input {\n"
		// Array is explicitly sized to the full image area (one uint per pixel).
		<< "    uint values[" << (m_imageSize.x() * m_imageSize.y()) << "];\n"
		<< "} sb_in;\n\n"
		<< "void main (void) {\n"
		// stride = total invocations along x = image width (asserted divisible).
		<< "    uint stride = gl_NumWorkGroups.x*gl_WorkGroupSize.x;\n"
		<< "    uint value  = sb_in.values[gl_GlobalInvocationID.y*stride + gl_GlobalInvocationID.x];\n"
		<< "    imageStore(u_dstImg, ivec2(gl_GlobalInvocationID.xy), uvec4(value, 0, 0, 0));\n"
		<< "}\n";

	sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
}
891
createInstance(Context & context) const892 TestInstance* CopySSBOToImageTest::createInstance (Context& context) const
893 {
894 return new CopySSBOToImageTestInstance(context, m_localSize, m_imageSize);
895 }
896
// Trivial constructor: captures the dispatch/image dimensions for iterate().
CopySSBOToImageTestInstance::CopySSBOToImageTestInstance (Context& context, const tcu::IVec2& localSize, const tcu::IVec2& imageSize)
	: TestInstance (context)
	, m_localSize (localSize)
	, m_imageSize (imageSize)
{
}
903
// Executes the test: fills a host-visible SSBO with random uints, dispatches
// the compute shader to imageStore them into an R32_UINT image, copies the
// image into a readback buffer and compares every texel against the input.
tcu::TestStatus CopySSBOToImageTestInstance::iterate (void)
{
	ContextCommonData data = m_context.getContextCommonData();
	const DeviceInterface& vkd = data.vkd;

	// Create an image, a view, and the output buffer.
	// ImageWithBuffer bundles the image, its view and a readback buffer;
	// TRANSFER_SRC is needed for the image-to-buffer copy, STORAGE for imageStore.
	const VkImageSubresourceRange subresourceRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
	ImageWithBuffer imageWithBuffer(vkd, data.device, data.allocator, vk::makeExtent3D(m_imageSize.x(), m_imageSize.y(), 1),
		VK_FORMAT_R32_UINT, VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_STORAGE_BIT, vk::VK_IMAGE_TYPE_2D,
		subresourceRange);

	const deUint32 imageArea = multiplyComponents(m_imageSize);
	const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * imageArea;	// one uint per pixel

	const BufferWithMemory inputBuffer(vkd, data.device, data.allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);

	// Populate the buffer with test data (fixed seed keeps the run deterministic).
	{
		de::Random rnd(0x77238ac2);
		const Allocation& inputBufferAllocation = inputBuffer.getAllocation();
		deUint32* bufferPtr = static_cast<deUint32*>(inputBufferAllocation.getHostPtr());
		for (deUint32 i = 0; i < imageArea; ++i)
			*bufferPtr++ = rnd.getUint32();

		flushAlloc(vkd, data.device, inputBufferAllocation);
	}

	// Create descriptor set: binding 0 = input SSBO, binding 1 = storage image
	// (matches the shader declared in initPrograms).
	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
		DescriptorSetLayoutBuilder()
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_SHADER_STAGE_COMPUTE_BIT)
		.build(vkd, data.device));

	const Unique<VkDescriptorPool> descriptorPool(
		DescriptorPoolBuilder()
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE)
		.build(vkd, data.device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vkd, data.device, *descriptorPool, *descriptorSetLayout));

	// Set the bindings

	const VkDescriptorImageInfo imageDescriptorInfo = makeDescriptorImageInfo(DE_NULL, imageWithBuffer.getImageView(), VK_IMAGE_LAYOUT_GENERAL);
	const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*inputBuffer, 0ull, bufferSizeBytes);

	DescriptorSetUpdateBuilder()
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &imageDescriptorInfo)
		.update(vkd, data.device);

	// Perform the computation
	{
		const Unique<VkShaderModule> shaderModule(createShaderModule(vkd, data.device, m_context.getBinaryCollection().get("comp"), 0u));
		const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vkd, data.device, *descriptorSetLayout));
		const Unique<VkPipeline> pipeline(makeComputePipeline(vkd, data.device, *pipelineLayout, *shaderModule));

		// Make the host writes to the input SSBO visible to the shader reads.
		const VkBufferMemoryBarrier inputBufferPostHostWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *inputBuffer, 0ull, bufferSizeBytes);

		// Transition the fresh image (UNDEFINED) to GENERAL for imageStore.
		const VkImageMemoryBarrier imageLayoutBarrier = makeImageMemoryBarrier(
			0u, VK_ACCESS_SHADER_WRITE_BIT,
			VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL,
			imageWithBuffer.getImage(), subresourceRange);

		// One invocation per pixel: constructor asserted exact divisibility.
		const tcu::IVec2 workSize = m_imageSize / m_localSize;

		// Prepare the command buffer

		const Unique<VkCommandPool> cmdPool(makeCommandPool(vkd, data.device, data.qfIndex));
		const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vkd, data.device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

		// Start recording commands

		beginCommandBuffer(vkd, *cmdBuffer);

		vkd.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
		vkd.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);

		// Host->compute barrier and layout transition must precede the dispatch.
		vkd.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &inputBufferPostHostWriteBarrier, 1, &imageLayoutBarrier);
		vkd.cmdDispatch(*cmdBuffer, workSize.x(), workSize.y(), 1u);

		// copyImageToBuffer also synchronizes against the shader writes
		// (srcAccess = SHADER_WRITE) before reading the image.
		copyImageToBuffer(vkd, *cmdBuffer, imageWithBuffer.getImage(), imageWithBuffer.getBuffer(), m_imageSize, VK_ACCESS_SHADER_WRITE_BIT, VK_IMAGE_LAYOUT_GENERAL);

		endCommandBuffer(vkd, *cmdBuffer);

		// Wait for completion

		submitCommandsAndWait(vkd, data.device, data.queue, *cmdBuffer);
	}

	// Validate the results

	const Allocation& outputBufferAllocation = imageWithBuffer.getBufferAllocation();
	invalidateAlloc(vkd, data.device, outputBufferAllocation);	// make device writes visible to the host

	const deUint32* bufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
	const deUint32* refBufferPtr = static_cast<deUint32*>(inputBuffer.getAllocation().getHostPtr());

	// The copy is an identity mapping, so the readback must equal the input.
	for (deUint32 ndx = 0; ndx < imageArea; ++ndx)
	{
		const deUint32 res = *(bufferPtr + ndx);
		const deUint32 ref = *(refBufferPtr + ndx);

		if (res != ref)
		{
			std::ostringstream msg;
			msg << "Comparison failed for pixel " << ndx;
			return tcu::TestStatus::fail(msg.str());
		}
	}
	return tcu::TestStatus::pass("Compute succeeded");
}
1017
// Test case: a compute shader reads uints from an input buffer (UBO or SSBO,
// chosen by the factory used) and writes the bitwise inverse to an output SSBO.
class BufferToBufferInvertTest : public vkt::TestCase
{
public:
	void initPrograms (SourceCollections& sourceCollections) const;
	TestInstance* createInstance (Context& context) const;

	// Factory: input bound as a uniform buffer (BUFFER_TYPE_UNIFORM).
	static BufferToBufferInvertTest* UBOToSSBOInvertCase (tcu::TestContext& testCtx,
	                                                      const std::string& name,
	                                                      const std::string& description,
	                                                      const deUint32 numValues,
	                                                      const tcu::IVec3& localSize,
	                                                      const tcu::IVec3& workSize);

	// Factory: input bound as a storage buffer (BUFFER_TYPE_SSBO).
	static BufferToBufferInvertTest* CopyInvertSSBOCase (tcu::TestContext& testCtx,
	                                                     const std::string& name,
	                                                     const std::string& description,
	                                                     const deUint32 numValues,
	                                                     const tcu::IVec3& localSize,
	                                                     const tcu::IVec3& workSize);

private:
	// Private: construction only through the two factories above.
	BufferToBufferInvertTest (tcu::TestContext& testCtx,
	                          const std::string& name,
	                          const std::string& description,
	                          const deUint32 numValues,
	                          const tcu::IVec3& localSize,
	                          const tcu::IVec3& workSize,
	                          const BufferType bufferType);

	const BufferType m_bufferType;	// BUFFER_TYPE_UNIFORM or BUFFER_TYPE_SSBO
	const deUint32 m_numValues;		// element count; must divide evenly over all invocations
	const tcu::IVec3 m_localSize;	// workgroup (local) size
	const tcu::IVec3 m_workSize;	// dispatch size in workgroups
};
1052
// Instance that executes the buffer-invert dispatch and validates the output.
class BufferToBufferInvertTestInstance : public vkt::TestInstance
{
public:
	BufferToBufferInvertTestInstance (Context& context,
	                                  const deUint32 numValues,
	                                  const tcu::IVec3& localSize,
	                                  const tcu::IVec3& workSize,
	                                  const BufferType bufferType);

	// Runs the dispatch and checks output[i] == ~input[i] for all i.
	tcu::TestStatus iterate (void);

private:
	const BufferType m_bufferType;	// selects UBO vs SSBO input binding
	const deUint32 m_numValues;		// number of uint values processed
	const tcu::IVec3 m_localSize;	// workgroup (local) size
	const tcu::IVec3 m_workSize;	// dispatch size in workgroups
};
1070
// Private constructor used by the two factories. numValues must split evenly
// across all invocations (workSize * localSize) so each invocation processes
// the same number of values in the shader loop.
BufferToBufferInvertTest::BufferToBufferInvertTest (tcu::TestContext& testCtx,
                                                    const std::string& name,
                                                    const std::string& description,
                                                    const deUint32 numValues,
                                                    const tcu::IVec3& localSize,
                                                    const tcu::IVec3& workSize,
                                                    const BufferType bufferType)
	: TestCase (testCtx, name, description)
	, m_bufferType (bufferType)
	, m_numValues (numValues)
	, m_localSize (localSize)
	, m_workSize (workSize)
{
	DE_ASSERT(m_numValues % (multiplyComponents(m_workSize) * multiplyComponents(m_localSize)) == 0);
	DE_ASSERT(m_bufferType == BUFFER_TYPE_UNIFORM || m_bufferType == BUFFER_TYPE_SSBO);
}
1087
UBOToSSBOInvertCase(tcu::TestContext & testCtx,const std::string & name,const std::string & description,const deUint32 numValues,const tcu::IVec3 & localSize,const tcu::IVec3 & workSize)1088 BufferToBufferInvertTest* BufferToBufferInvertTest::UBOToSSBOInvertCase (tcu::TestContext& testCtx,
1089 const std::string& name,
1090 const std::string& description,
1091 const deUint32 numValues,
1092 const tcu::IVec3& localSize,
1093 const tcu::IVec3& workSize)
1094 {
1095 return new BufferToBufferInvertTest(testCtx, name, description, numValues, localSize, workSize, BUFFER_TYPE_UNIFORM);
1096 }
1097
CopyInvertSSBOCase(tcu::TestContext & testCtx,const std::string & name,const std::string & description,const deUint32 numValues,const tcu::IVec3 & localSize,const tcu::IVec3 & workSize)1098 BufferToBufferInvertTest* BufferToBufferInvertTest::CopyInvertSSBOCase (tcu::TestContext& testCtx,
1099 const std::string& name,
1100 const std::string& description,
1101 const deUint32 numValues,
1102 const tcu::IVec3& localSize,
1103 const tcu::IVec3& workSize)
1104 {
1105 return new BufferToBufferInvertTest(testCtx, name, description, numValues, localSize, workSize, BUFFER_TYPE_SSBO);
1106 }
1107
// Builds one of two GLSL ES 3.10 shaders depending on the input binding type.
// Both variants: each invocation computes its linear index, derives a
// contiguous range of values to process, and writes the bitwise complement of
// each input value to the output SSBO (binding 1, std140 layout).
void BufferToBufferInvertTest::initPrograms (SourceCollections& sourceCollections) const
{
	std::ostringstream src;
	if (m_bufferType == BUFFER_TYPE_UNIFORM)
	{
		// Input bound as a uniform block (binding 0).
		src << "#version 310 es\n"
			<< "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
			<< "layout(binding = 0) readonly uniform Input {\n"
			<< "    uint values[" << m_numValues << "];\n"
			<< "} ub_in;\n"
			<< "layout(binding = 1, std140) writeonly buffer Output {\n"
			<< "    uint values[" << m_numValues << "];\n"
			<< "} sb_out;\n"
			<< "void main (void) {\n"
			// Total invocation count; numValues is asserted divisible by it.
			<< "    uvec3 size           = gl_NumWorkGroups * gl_WorkGroupSize;\n"
			<< "    uint numValuesPerInv = uint(ub_in.values.length()) / (size.x*size.y*size.z);\n"
			<< "    uint groupNdx        = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
			<< "    uint offset          = numValuesPerInv*groupNdx;\n"
			<< "\n"
			<< "    for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
			<< "        sb_out.values[offset + ndx] = ~ub_in.values[offset + ndx];\n"
			<< "}\n";
	}
	else if (m_bufferType == BUFFER_TYPE_SSBO)
	{
		// Input bound as a storage block (binding 0, std140 to match the
		// UVec4-per-value host-side layout).
		src << "#version 310 es\n"
			<< "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
			<< "layout(binding = 0, std140) readonly buffer Input {\n"
			<< "    uint values[" << m_numValues << "];\n"
			<< "} sb_in;\n"
			<< "layout (binding = 1, std140) writeonly buffer Output {\n"
			<< "    uint values[" << m_numValues << "];\n"
			<< "} sb_out;\n"
			<< "void main (void) {\n"
			<< "    uvec3 size           = gl_NumWorkGroups * gl_WorkGroupSize;\n"
			<< "    uint numValuesPerInv = uint(sb_in.values.length()) / (size.x*size.y*size.z);\n"
			<< "    uint groupNdx        = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
			<< "    uint offset          = numValuesPerInv*groupNdx;\n"
			<< "\n"
			<< "    for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
			<< "        sb_out.values[offset + ndx] = ~sb_in.values[offset + ndx];\n"
			<< "}\n";
	}

	sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
}
1154
createInstance(Context & context) const1155 TestInstance* BufferToBufferInvertTest::createInstance (Context& context) const
1156 {
1157 return new BufferToBufferInvertTestInstance(context, m_numValues, m_localSize, m_workSize, m_bufferType);
1158 }
1159
// Trivial constructor: captures the dispatch parameters and buffer type
// for iterate().
BufferToBufferInvertTestInstance::BufferToBufferInvertTestInstance (Context& context,
                                                                    const deUint32 numValues,
                                                                    const tcu::IVec3& localSize,
                                                                    const tcu::IVec3& workSize,
                                                                    const BufferType bufferType)
	: TestInstance (context)
	, m_bufferType (bufferType)
	, m_numValues (numValues)
	, m_localSize (localSize)
	, m_workSize (workSize)
{
}
1172
// Executes the test: fills a host-visible input buffer with random data,
// dispatches the invert shader and checks output[i] == ~input[i].
// Host-side layout is one tcu::UVec4 (16 bytes) per value, matching the
// shader's std140 array stride; only the .x component carries data.
tcu::TestStatus BufferToBufferInvertTestInstance::iterate (void)
{
	const DeviceInterface& vk = m_context.getDeviceInterface();
	const VkDevice device = m_context.getDevice();
	const VkQueue queue = m_context.getUniversalQueue();
	const deUint32 queueFamilyIndex = m_context.getUniversalQueueFamilyIndex();
	Allocator& allocator = m_context.getDefaultAllocator();

	// Customize the test based on buffer type (UBO vs SSBO input binding).
	// Different random seeds keep the two variants from sharing test data.

	const VkBufferUsageFlags inputBufferUsageFlags = (m_bufferType == BUFFER_TYPE_UNIFORM ? VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT : VK_BUFFER_USAGE_STORAGE_BUFFER_BIT);
	const VkDescriptorType inputBufferDescriptorType = (m_bufferType == BUFFER_TYPE_UNIFORM ? VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER : VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
	const deUint32 randomSeed = (m_bufferType == BUFFER_TYPE_UNIFORM ? 0x111223f : 0x124fef);

	// Create an input buffer

	const VkDeviceSize bufferSizeBytes = sizeof(tcu::UVec4) * m_numValues;	// 16 bytes per value (std140 stride)
	const BufferWithMemory inputBuffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, inputBufferUsageFlags), MemoryRequirement::HostVisible);

	// Fill the input buffer with data (only .x of each UVec4 is used/compared)
	{
		de::Random rnd(randomSeed);
		const Allocation& inputBufferAllocation = inputBuffer.getAllocation();
		tcu::UVec4* bufferPtr = static_cast<tcu::UVec4*>(inputBufferAllocation.getHostPtr());
		for (deUint32 i = 0; i < m_numValues; ++i)
			bufferPtr[i].x() = rnd.getUint32();

		flushAlloc(vk, device, inputBufferAllocation);
	}

	// Create an output buffer

	const BufferWithMemory outputBuffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);

	// Create descriptor set: binding 0 = input (type varies), binding 1 = output SSBO

	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
		DescriptorSetLayoutBuilder()
		.addSingleBinding(inputBufferDescriptorType, VK_SHADER_STAGE_COMPUTE_BIT)
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.build(vk, device));

	const Unique<VkDescriptorPool> descriptorPool(
		DescriptorPoolBuilder()
		.addType(inputBufferDescriptorType)
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

	const VkDescriptorBufferInfo inputBufferDescriptorInfo = makeDescriptorBufferInfo(*inputBuffer, 0ull, bufferSizeBytes);
	const VkDescriptorBufferInfo outputBufferDescriptorInfo = makeDescriptorBufferInfo(*outputBuffer, 0ull, bufferSizeBytes);
	DescriptorSetUpdateBuilder()
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), inputBufferDescriptorType, &inputBufferDescriptorInfo)
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &outputBufferDescriptorInfo)
		.update(vk, device);

	// Perform the computation

	const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
	const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
	const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));

	// Make host writes to the input visible to shader reads.
	const VkBufferMemoryBarrier hostWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *inputBuffer, 0ull, bufferSizeBytes);

	// Make shader writes to the output visible to the host readback.
	const VkBufferMemoryBarrier shaderWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *outputBuffer, 0ull, bufferSizeBytes);

	const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
	const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

	// Start recording commands

	beginCommandBuffer(vk, *cmdBuffer);

	vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
	vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);

	// host write -> dispatch -> host read, fenced by the two barriers above
	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
	vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &shaderWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

	endCommandBuffer(vk, *cmdBuffer);

	// Wait for completion

	submitCommandsAndWait(vk, device, queue, *cmdBuffer);

	// Validate the results

	const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
	invalidateAlloc(vk, device, outputBufferAllocation);	// make device writes visible to the host

	const tcu::UVec4* bufferPtr = static_cast<tcu::UVec4*>(outputBufferAllocation.getHostPtr());
	const tcu::UVec4* refBufferPtr = static_cast<tcu::UVec4*>(inputBuffer.getAllocation().getHostPtr());

	for (deUint32 ndx = 0; ndx < m_numValues; ++ndx)
	{
		const deUint32 res = bufferPtr[ndx].x();
		const deUint32 ref = ~refBufferPtr[ndx].x();	// shader writes the bitwise complement

		if (res != ref)
		{
			std::ostringstream msg;
			msg << "Comparison failed for Output.values[" << ndx << "]";
			return tcu::TestStatus::fail(msg.str());
		}
	}
	return tcu::TestStatus::pass("Compute succeeded");
}
1282
// Test case: a compute shader reads and writes the SAME SSBO, replacing each
// uint with its bitwise inverse in place.
class InvertSSBOInPlaceTest : public vkt::TestCase
{
public:
	// sized selects whether the shader declares the array with an explicit
	// size (numValues) or as a runtime-sized array.
	InvertSSBOInPlaceTest (tcu::TestContext& testCtx,
	                       const std::string& name,
	                       const std::string& description,
	                       const deUint32 numValues,
	                       const bool sized,
	                       const tcu::IVec3& localSize,
	                       const tcu::IVec3& workSize);


	void initPrograms (SourceCollections& sourceCollections) const;
	TestInstance* createInstance (Context& context) const;

private:
	const deUint32 m_numValues;		// element count; must divide evenly over all invocations
	const bool m_sized;				// explicit vs runtime-sized shader array
	const tcu::IVec3 m_localSize;	// workgroup (local) size
	const tcu::IVec3 m_workSize;	// dispatch size in workgroups
};
1304
// Instance that executes the in-place invert dispatch and validates the buffer.
class InvertSSBOInPlaceTestInstance : public vkt::TestInstance
{
public:
	InvertSSBOInPlaceTestInstance (Context& context,
	                               const deUint32 numValues,
	                               const tcu::IVec3& localSize,
	                               const tcu::IVec3& workSize);

	// Runs the dispatch and checks buffer[i] == ~original[i] for all i.
	tcu::TestStatus iterate (void);

private:
	const deUint32 m_numValues;		// number of uint values processed
	const tcu::IVec3 m_localSize;	// workgroup (local) size
	const tcu::IVec3 m_workSize;	// dispatch size in workgroups
};
1320
// Stores the test parameters. numValues must split evenly across all
// invocations (workSize * localSize) so the per-invocation loop count in the
// shader is exact.
InvertSSBOInPlaceTest::InvertSSBOInPlaceTest (tcu::TestContext& testCtx,
                                              const std::string& name,
                                              const std::string& description,
                                              const deUint32 numValues,
                                              const bool sized,
                                              const tcu::IVec3& localSize,
                                              const tcu::IVec3& workSize)
	: TestCase (testCtx, name, description)
	, m_numValues (numValues)
	, m_sized (sized)
	, m_localSize (localSize)
	, m_workSize (workSize)
{
	DE_ASSERT(m_numValues % (multiplyComponents(m_workSize) * multiplyComponents(m_localSize)) == 0);
}
1336
// Builds the GLSL ES 3.10 shader: a single read-write SSBO (binding 0) whose
// array is either explicitly sized (m_sized) or runtime-sized (length() then
// reflects the bound range). Each invocation inverts its contiguous slice
// in place.
void InvertSSBOInPlaceTest::initPrograms (SourceCollections& sourceCollections) const
{
	std::ostringstream src;
	src << "#version 310 es\n"
		<< "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
		<< "layout(binding = 0) buffer InOut {\n"
		// Empty brackets produce a runtime-sized array when !m_sized.
		<< "    uint values[" << (m_sized ? de::toString(m_numValues) : "") << "];\n"
		<< "} sb_inout;\n"
		<< "void main (void) {\n"
		<< "    uvec3 size           = gl_NumWorkGroups * gl_WorkGroupSize;\n"
		<< "    uint numValuesPerInv = uint(sb_inout.values.length()) / (size.x*size.y*size.z);\n"
		<< "    uint groupNdx        = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
		<< "    uint offset          = numValuesPerInv*groupNdx;\n"
		<< "\n"
		<< "    for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
		<< "        sb_inout.values[offset + ndx] = ~sb_inout.values[offset + ndx];\n"
		<< "}\n";

	sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
}
1357
createInstance(Context & context) const1358 TestInstance* InvertSSBOInPlaceTest::createInstance (Context& context) const
1359 {
1360 return new InvertSSBOInPlaceTestInstance(context, m_numValues, m_localSize, m_workSize);
1361 }
1362
// Trivial constructor: captures the dispatch parameters for iterate().
InvertSSBOInPlaceTestInstance::InvertSSBOInPlaceTestInstance (Context& context,
                                                              const deUint32 numValues,
                                                              const tcu::IVec3& localSize,
                                                              const tcu::IVec3& workSize)
	: TestInstance (context)
	, m_numValues (numValues)
	, m_localSize (localSize)
	, m_workSize (workSize)
{
}
1373
// Executes the test: fills one host-visible SSBO with random data (keeping a
// host-side copy, since the buffer is overwritten in place), dispatches the
// invert shader, then checks buffer[i] == ~inputData[i].
tcu::TestStatus InvertSSBOInPlaceTestInstance::iterate (void)
{
	const DeviceInterface& vk = m_context.getDeviceInterface();
	const VkDevice device = m_context.getDevice();
	const VkQueue queue = m_context.getUniversalQueue();
	const deUint32 queueFamilyIndex = m_context.getUniversalQueueFamilyIndex();
	Allocator& allocator = m_context.getDefaultAllocator();

	// Create an input/output buffer (tightly packed uints)

	const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * m_numValues;
	const BufferWithMemory buffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);

	// Fill the buffer with data

	typedef std::vector<deUint32> data_vector_t;
	data_vector_t inputData(m_numValues);	// host copy kept for validation

	{
		de::Random rnd(0x82ce7f);
		const Allocation& bufferAllocation = buffer.getAllocation();
		deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
		// Write each random value to both the device buffer and the host copy.
		for (deUint32 i = 0; i < m_numValues; ++i)
			inputData[i] = *bufferPtr++ = rnd.getUint32();

		flushAlloc(vk, device, bufferAllocation);
	}

	// Create descriptor set: a single read-write SSBO at binding 0

	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
		DescriptorSetLayoutBuilder()
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.build(vk, device));

	const Unique<VkDescriptorPool> descriptorPool(
		DescriptorPoolBuilder()
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

	const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*buffer, 0ull, bufferSizeBytes);
	DescriptorSetUpdateBuilder()
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
		.update(vk, device);

	// Perform the computation

	const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
	const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
	const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));

	// Both barriers target the same buffer: host write -> shader read before
	// the dispatch, shader write -> host read after it.
	const VkBufferMemoryBarrier hostWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *buffer, 0ull, bufferSizeBytes);

	const VkBufferMemoryBarrier shaderWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer, 0ull, bufferSizeBytes);

	const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
	const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

	// Start recording commands

	beginCommandBuffer(vk, *cmdBuffer);

	vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
	vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);

	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
	vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &shaderWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

	endCommandBuffer(vk, *cmdBuffer);

	// Wait for completion

	submitCommandsAndWait(vk, device, queue, *cmdBuffer);

	// Validate the results

	const Allocation& bufferAllocation = buffer.getAllocation();
	invalidateAlloc(vk, device, bufferAllocation);	// make device writes visible to the host

	const deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());

	// Compare against the saved host copy (the device buffer was overwritten).
	for (deUint32 ndx = 0; ndx < m_numValues; ++ndx)
	{
		const deUint32 res = bufferPtr[ndx];
		const deUint32 ref = ~inputData[ndx];

		if (res != ref)
		{
			std::ostringstream msg;
			msg << "Comparison failed for InOut.values[" << ndx << "]";
			return tcu::TestStatus::fail(msg.str());
		}
	}
	return tcu::TestStatus::pass("Compute succeeded");
}
1472
// Test case: a single compute dispatch writes two different patterns into two
// separate SSBOs bound at bindings 0 and 1 of the same descriptor set.
// 'sized' selects whether the shader declares the output arrays with an
// explicit size or as unsized (runtime-sized) arrays.
class WriteToMultipleSSBOTest : public vkt::TestCase
{
public:
						WriteToMultipleSSBOTest	(tcu::TestContext&	testCtx,
												 const std::string&	name,
												 const std::string&	description,
												 const deUint32		numValues,
												 const bool			sized,
												 const tcu::IVec3&	localSize,
												 const tcu::IVec3&	workSize);

	// Generates the GLSL compute shader ("comp") that fills both output buffers.
	void				initPrograms			(SourceCollections& sourceCollections) const;
	TestInstance*		createInstance			(Context& context) const;

private:
	const deUint32		m_numValues;	// number of uint values written to EACH output buffer
	const bool			m_sized;		// true: explicit array size in shader; false: unsized arrays
	const tcu::IVec3	m_localSize;	// workgroup (local) size
	const tcu::IVec3	m_workSize;		// number of workgroups dispatched
};
1493
// Instance side of WriteToMultipleSSBOTest: records and submits the dispatch,
// then validates both output buffers on the host.
class WriteToMultipleSSBOTestInstance : public vkt::TestInstance
{
public:
						WriteToMultipleSSBOTestInstance	(Context&			context,
														 const deUint32		numValues,
														 const tcu::IVec3&	localSize,
														 const tcu::IVec3&	workSize);

	tcu::TestStatus		iterate							(void);

private:
	const deUint32		m_numValues;	// number of uint values expected in each buffer
	const tcu::IVec3	m_localSize;	// workgroup (local) size (used only for sizing asserts)
	const tcu::IVec3	m_workSize;		// dispatch dimensions
};
1509
WriteToMultipleSSBOTest(tcu::TestContext & testCtx,const std::string & name,const std::string & description,const deUint32 numValues,const bool sized,const tcu::IVec3 & localSize,const tcu::IVec3 & workSize)1510 WriteToMultipleSSBOTest::WriteToMultipleSSBOTest (tcu::TestContext& testCtx,
1511 const std::string& name,
1512 const std::string& description,
1513 const deUint32 numValues,
1514 const bool sized,
1515 const tcu::IVec3& localSize,
1516 const tcu::IVec3& workSize)
1517 : TestCase (testCtx, name, description)
1518 , m_numValues (numValues)
1519 , m_sized (sized)
1520 , m_localSize (localSize)
1521 , m_workSize (workSize)
1522 {
1523 DE_ASSERT(m_numValues % (multiplyComponents(m_workSize) * multiplyComponents(m_localSize)) == 0);
1524 }
1525
initPrograms(SourceCollections & sourceCollections) const1526 void WriteToMultipleSSBOTest::initPrograms (SourceCollections& sourceCollections) const
1527 {
1528 std::ostringstream src;
1529 src << "#version 310 es\n"
1530 << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
1531 << "layout(binding = 0) writeonly buffer Out0 {\n"
1532 << " uint values[" << (m_sized ? de::toString(m_numValues) : "") << "];\n"
1533 << "} sb_out0;\n"
1534 << "layout(binding = 1) writeonly buffer Out1 {\n"
1535 << " uint values[" << (m_sized ? de::toString(m_numValues) : "") << "];\n"
1536 << "} sb_out1;\n"
1537 << "void main (void) {\n"
1538 << " uvec3 size = gl_NumWorkGroups * gl_WorkGroupSize;\n"
1539 << " uint groupNdx = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
1540 << "\n"
1541 << " {\n"
1542 << " uint numValuesPerInv = uint(sb_out0.values.length()) / (size.x*size.y*size.z);\n"
1543 << " uint offset = numValuesPerInv*groupNdx;\n"
1544 << "\n"
1545 << " for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
1546 << " sb_out0.values[offset + ndx] = offset + ndx;\n"
1547 << " }\n"
1548 << " {\n"
1549 << " uint numValuesPerInv = uint(sb_out1.values.length()) / (size.x*size.y*size.z);\n"
1550 << " uint offset = numValuesPerInv*groupNdx;\n"
1551 << "\n"
1552 << " for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
1553 << " sb_out1.values[offset + ndx] = uint(sb_out1.values.length()) - offset - ndx;\n"
1554 << " }\n"
1555 << "}\n";
1556
1557 sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
1558 }
1559
createInstance(Context & context) const1560 TestInstance* WriteToMultipleSSBOTest::createInstance (Context& context) const
1561 {
1562 return new WriteToMultipleSSBOTestInstance(context, m_numValues, m_localSize, m_workSize);
1563 }
1564
// Instance constructor: only stores the dispatch parameters for iterate().
WriteToMultipleSSBOTestInstance::WriteToMultipleSSBOTestInstance (Context& context,
																  const deUint32 numValues,
																  const tcu::IVec3& localSize,
																  const tcu::IVec3& workSize)
	: TestInstance	(context)
	, m_numValues	(numValues)
	, m_localSize	(localSize)
	, m_workSize	(workSize)
{
}
1575
// Runs one dispatch that writes both SSBOs, then verifies:
//   Out0.values[i] == i  and  Out1.values[i] == m_numValues - i.
tcu::TestStatus WriteToMultipleSSBOTestInstance::iterate (void)
{
	const DeviceInterface& vk = m_context.getDeviceInterface();
	const VkDevice device = m_context.getDevice();
	const VkQueue queue = m_context.getUniversalQueue();
	const deUint32 queueFamilyIndex = m_context.getUniversalQueueFamilyIndex();
	Allocator& allocator = m_context.getDefaultAllocator();

	// Create two output buffers (host-visible so the results can be mapped and read back)

	const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * m_numValues;
	const BufferWithMemory buffer0(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
	const BufferWithMemory buffer1(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);

	// Create descriptor set: two storage buffers at bindings 0 and 1, matching the shader interface

	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
		DescriptorSetLayoutBuilder()
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.build(vk, device));

	const Unique<VkDescriptorPool> descriptorPool(
		DescriptorPoolBuilder()
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 2u)
		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

	const VkDescriptorBufferInfo buffer0DescriptorInfo = makeDescriptorBufferInfo(*buffer0, 0ull, bufferSizeBytes);
	const VkDescriptorBufferInfo buffer1DescriptorInfo = makeDescriptorBufferInfo(*buffer1, 0ull, bufferSizeBytes);
	DescriptorSetUpdateBuilder()
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &buffer0DescriptorInfo)
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &buffer1DescriptorInfo)
		.update(vk, device);

	// Perform the computation

	const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
	const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
	const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));

	// One barrier per buffer: make the shader writes available to host reads after the dispatch
	const VkBufferMemoryBarrier shaderWriteBarriers[] =
	{
		makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer0, 0ull, bufferSizeBytes),
		makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer1, 0ull, bufferSizeBytes)
	};

	const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
	const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

	// Start recording commands

	beginCommandBuffer(vk, *cmdBuffer);

	vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
	vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);

	vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, DE_LENGTH_OF_ARRAY(shaderWriteBarriers), shaderWriteBarriers, 0, (const VkImageMemoryBarrier*)DE_NULL);

	endCommandBuffer(vk, *cmdBuffer);

	// Wait for completion

	submitCommandsAndWait(vk, device, queue, *cmdBuffer);

	// Validate the results
	{
		// buffer0 must contain the identity pattern: values[i] == i
		const Allocation& buffer0Allocation = buffer0.getAllocation();
		invalidateAlloc(vk, device, buffer0Allocation);
		const deUint32* buffer0Ptr = static_cast<deUint32*>(buffer0Allocation.getHostPtr());

		for (deUint32 ndx = 0; ndx < m_numValues; ++ndx)
		{
			const deUint32 res = buffer0Ptr[ndx];
			const deUint32 ref = ndx;

			if (res != ref)
			{
				std::ostringstream msg;
				msg << "Comparison failed for Out0.values[" << ndx << "] res=" << res << " ref=" << ref;
				return tcu::TestStatus::fail(msg.str());
			}
		}
	}
	{
		// buffer1 must contain the mirrored pattern: values[i] == m_numValues - i
		const Allocation& buffer1Allocation = buffer1.getAllocation();
		invalidateAlloc(vk, device, buffer1Allocation);
		const deUint32* buffer1Ptr = static_cast<deUint32*>(buffer1Allocation.getHostPtr());

		for (deUint32 ndx = 0; ndx < m_numValues; ++ndx)
		{
			const deUint32 res = buffer1Ptr[ndx];
			const deUint32 ref = m_numValues - ndx;

			if (res != ref)
			{
				std::ostringstream msg;
				msg << "Comparison failed for Out1.values[" << ndx << "] res=" << res << " ref=" << ref;
				return tcu::TestStatus::fail(msg.str());
			}
		}
	}
	return tcu::TestStatus::pass("Compute succeeded");
}
1682
// Test case: two back-to-back compute pipelines communicate through an SSBO
// with a shader-write -> shader-read pipeline barrier in between. The first
// pass fills a work buffer; the second pass reduces it with atomicAdd.
class SSBOBarrierTest : public vkt::TestCase
{
public:
					SSBOBarrierTest	(tcu::TestContext&	testCtx,
									 const std::string&	name,
									 const std::string&	description,
									 const tcu::IVec3&	workSize);

	// Generates two GLSL compute shaders: "comp0" (producer) and "comp1" (consumer).
	void			initPrograms	(SourceCollections& sourceCollections) const;
	TestInstance*	createInstance	(Context& context) const;

private:
	const tcu::IVec3	m_workSize;	// dispatch dimensions used for both passes
};
1697
// Instance side of SSBOBarrierTest: records both dispatches with the
// intermediate barrier and validates the final atomic sum on the host.
class SSBOBarrierTestInstance : public vkt::TestInstance
{
public:
						SSBOBarrierTestInstance	(Context&			context,
												 const tcu::IVec3&	workSize);

	tcu::TestStatus		iterate					(void);

private:
	const tcu::IVec3	m_workSize;	// dispatch dimensions used for both passes
};
1709
// Constructor: only stores the dispatch dimensions.
SSBOBarrierTest::SSBOBarrierTest (tcu::TestContext& testCtx,
								  const std::string& name,
								  const std::string& description,
								  const tcu::IVec3& workSize)
	: TestCase	(testCtx, name, description)
	, m_workSize (workSize)
{
}
1718
initPrograms(SourceCollections & sourceCollections) const1719 void SSBOBarrierTest::initPrograms (SourceCollections& sourceCollections) const
1720 {
1721 sourceCollections.glslSources.add("comp0") << glu::ComputeSource(
1722 "#version 310 es\n"
1723 "layout (local_size_x = 1) in;\n"
1724 "layout(binding = 2) readonly uniform Constants {\n"
1725 " uint u_baseVal;\n"
1726 "};\n"
1727 "layout(binding = 1) writeonly buffer Output {\n"
1728 " uint values[];\n"
1729 "};\n"
1730 "void main (void) {\n"
1731 " uint offset = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
1732 " values[offset] = u_baseVal + offset;\n"
1733 "}\n");
1734
1735 sourceCollections.glslSources.add("comp1") << glu::ComputeSource(
1736 "#version 310 es\n"
1737 "layout (local_size_x = 1) in;\n"
1738 "layout(binding = 1) readonly buffer Input {\n"
1739 " uint values[];\n"
1740 "};\n"
1741 "layout(binding = 0) coherent buffer Output {\n"
1742 " uint sum;\n"
1743 "};\n"
1744 "void main (void) {\n"
1745 " uint offset = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
1746 " uint value = values[offset];\n"
1747 " atomicAdd(sum, value);\n"
1748 "}\n");
1749 }
1750
createInstance(Context & context) const1751 TestInstance* SSBOBarrierTest::createInstance (Context& context) const
1752 {
1753 return new SSBOBarrierTestInstance(context, m_workSize);
1754 }
1755
// Instance constructor: only stores the dispatch dimensions for iterate().
SSBOBarrierTestInstance::SSBOBarrierTestInstance (Context& context, const tcu::IVec3& workSize)
	: TestInstance	(context)
	, m_workSize	(workSize)
{
}
1761
// Runs the two-pass computation: pass 0 fills the work buffer with
// (baseValue + index), a compute->compute barrier makes those writes visible,
// then pass 1 atomically sums all values. Expected result: sum over all
// workgroups of (baseValue + groupIndex).
tcu::TestStatus SSBOBarrierTestInstance::iterate (void)
{
	const DeviceInterface& vk = m_context.getDeviceInterface();
	const VkDevice device = m_context.getDevice();
	const VkQueue queue = m_context.getUniversalQueue();
	const deUint32 queueFamilyIndex = m_context.getUniversalQueueFamilyIndex();
	Allocator& allocator = m_context.getDefaultAllocator();

	// Create a work buffer used by both shaders
	// (device access only, so no host-visibility requirement)

	const int workGroupCount = multiplyComponents(m_workSize);
	const VkDeviceSize workBufferSizeBytes = sizeof(deUint32) * workGroupCount;
	const BufferWithMemory workBuffer(vk, device, allocator, makeBufferCreateInfo(workBufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::Any);

	// Create an output buffer (holds the single atomic sum, read back by the host)

	const VkDeviceSize outputBufferSizeBytes = sizeof(deUint32);
	const BufferWithMemory outputBuffer(vk, device, allocator, makeBufferCreateInfo(outputBufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);

	// Initialize atomic counter value to zero
	{
		const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
		deUint32* outputBufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
		*outputBufferPtr = 0;
		flushAlloc(vk, device, outputBufferAllocation);
	}

	// Create a uniform buffer (to pass uniform constants)

	const VkDeviceSize uniformBufferSizeBytes = sizeof(deUint32);
	const BufferWithMemory uniformBuffer(vk, device, allocator, makeBufferCreateInfo(uniformBufferSizeBytes, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT), MemoryRequirement::HostVisible);

	// Set the constants in the uniform buffer

	const deUint32 baseValue = 127;
	{
		const Allocation& uniformBufferAllocation = uniformBuffer.getAllocation();
		deUint32* uniformBufferPtr = static_cast<deUint32*>(uniformBufferAllocation.getHostPtr());
		uniformBufferPtr[0] = baseValue;

		flushAlloc(vk, device, uniformBufferAllocation);
	}

	// Create descriptor set
	// Bindings match the shader interface: 0 = sum SSBO, 1 = work SSBO, 2 = constants UBO

	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
		DescriptorSetLayoutBuilder()
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.addSingleBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.build(vk, device));

	const Unique<VkDescriptorPool> descriptorPool(
		DescriptorPoolBuilder()
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 2u)
		.addType(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)
		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

	const VkDescriptorBufferInfo workBufferDescriptorInfo = makeDescriptorBufferInfo(*workBuffer, 0ull, workBufferSizeBytes);
	const VkDescriptorBufferInfo outputBufferDescriptorInfo = makeDescriptorBufferInfo(*outputBuffer, 0ull, outputBufferSizeBytes);
	const VkDescriptorBufferInfo uniformBufferDescriptorInfo = makeDescriptorBufferInfo(*uniformBuffer, 0ull, uniformBufferSizeBytes);
	DescriptorSetUpdateBuilder()
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &outputBufferDescriptorInfo)
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &workBufferDescriptorInfo)
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(2u), VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, &uniformBufferDescriptorInfo)
		.update(vk, device);

	// Perform the computation
	// Both pipelines share the same layout, so the descriptor set stays bound across the pipeline switch.

	const Unique<VkShaderModule> shaderModule0(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp0"), 0));
	const Unique<VkShaderModule> shaderModule1(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp1"), 0));

	const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
	const Unique<VkPipeline> pipeline0(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule0));
	const Unique<VkPipeline> pipeline1(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule1));

	// Host write of the constants must be visible to the first shader's uniform reads
	const VkBufferMemoryBarrier writeUniformConstantsBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_UNIFORM_READ_BIT, *uniformBuffer, 0ull, uniformBufferSizeBytes);

	// The barrier under test: pass 0's SSBO writes must be visible to pass 1's reads
	const VkBufferMemoryBarrier betweenShadersBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *workBuffer, 0ull, workBufferSizeBytes);

	// Final shader writes to the sum must be visible to the host readback
	const VkBufferMemoryBarrier afterComputeBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *outputBuffer, 0ull, outputBufferSizeBytes);

	const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
	const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

	// Start recording commands

	beginCommandBuffer(vk, *cmdBuffer);

	vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline0);
	vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);

	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &writeUniformConstantsBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

	vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &betweenShadersBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

	// Switch to the second shader program
	vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline1);

	vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &afterComputeBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

	endCommandBuffer(vk, *cmdBuffer);

	// Wait for completion

	submitCommandsAndWait(vk, device, queue, *cmdBuffer);

	// Validate the results

	const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
	invalidateAlloc(vk, device, outputBufferAllocation);

	const deUint32* bufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
	const deUint32 res = *bufferPtr;
	deUint32 ref = 0;

	// Reference: sum of (baseValue + groupIndex) over every workgroup
	for (int ndx = 0; ndx < workGroupCount; ++ndx)
		ref += baseValue + ndx;

	if (res != ref)
	{
		std::ostringstream msg;
		msg << "ERROR: comparison failed, expected " << ref << ", got " << res;
		return tcu::TestStatus::fail(msg.str());
	}
	return tcu::TestStatus::pass("Compute succeeded");
}
1893
// Test case: exercises imageAtomicAdd on an r32ui storage image. Each
// workgroup of 'localSize' invocations accumulates its input values into one
// texel of an imageSize.x() x imageSize.y() image.
class ImageAtomicOpTest : public vkt::TestCase
{
public:
					ImageAtomicOpTest	(tcu::TestContext&	testCtx,
										 const std::string&	name,
										 const std::string&	description,
										 const deUint32		localSize,
										 const tcu::IVec2&	imageSize);

	// Generates the GLSL compute shader ("comp") performing the atomic adds.
	void			initPrograms		(SourceCollections& sourceCollections) const;
	TestInstance*	createInstance		(Context& context) const;

private:
	const deUint32		m_localSize;	// invocations per workgroup (values summed per texel)
	const tcu::IVec2	m_imageSize;	// image dimensions; one workgroup per texel
};
1910
// Instance side of ImageAtomicOpTest: runs the dispatch, copies the image to
// a buffer, and checks each texel against the host-computed sum.
class ImageAtomicOpTestInstance : public vkt::TestInstance
{
public:
						ImageAtomicOpTestInstance	(Context&			context,
													 const deUint32		localSize,
													 const tcu::IVec2&	imageSize);

	tcu::TestStatus		iterate						(void);

private:
	const deUint32		m_localSize;	// invocations per workgroup
	const tcu::IVec2	m_imageSize;	// image dimensions
};
1924
// Constructor: only stores the workgroup size and image dimensions.
ImageAtomicOpTest::ImageAtomicOpTest (tcu::TestContext& testCtx,
									  const std::string& name,
									  const std::string& description,
									  const deUint32 localSize,
									  const tcu::IVec2& imageSize)
	: TestCase		(testCtx, name, description)
	, m_localSize	(localSize)
	, m_imageSize	(imageSize)
{
}
1935
initPrograms(SourceCollections & sourceCollections) const1936 void ImageAtomicOpTest::initPrograms (SourceCollections& sourceCollections) const
1937 {
1938 std::ostringstream src;
1939 src << "#version 310 es\n"
1940 << "#extension GL_OES_shader_image_atomic : require\n"
1941 << "layout (local_size_x = " << m_localSize << ") in;\n"
1942 << "layout(binding = 1, r32ui) coherent uniform highp uimage2D u_dstImg;\n"
1943 << "layout(binding = 0) readonly buffer Input {\n"
1944 << " uint values[" << (multiplyComponents(m_imageSize) * m_localSize) << "];\n"
1945 << "} sb_in;\n\n"
1946 << "void main (void) {\n"
1947 << " uint stride = gl_NumWorkGroups.x*gl_WorkGroupSize.x;\n"
1948 << " uint value = sb_in.values[gl_GlobalInvocationID.y*stride + gl_GlobalInvocationID.x];\n"
1949 << "\n"
1950 << " if (gl_LocalInvocationIndex == 0u)\n"
1951 << " imageStore(u_dstImg, ivec2(gl_WorkGroupID.xy), uvec4(0));\n"
1952 << " memoryBarrierImage();\n"
1953 << " barrier();\n"
1954 << " imageAtomicAdd(u_dstImg, ivec2(gl_WorkGroupID.xy), value);\n"
1955 << "}\n";
1956
1957 sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
1958 }
1959
createInstance(Context & context) const1960 TestInstance* ImageAtomicOpTest::createInstance (Context& context) const
1961 {
1962 return new ImageAtomicOpTestInstance(context, m_localSize, m_imageSize);
1963 }
1964
// Instance constructor: only stores the parameters for iterate().
ImageAtomicOpTestInstance::ImageAtomicOpTestInstance (Context& context, const deUint32 localSize, const tcu::IVec2& imageSize)
	: TestInstance	(context)
	, m_localSize	(localSize)
	, m_imageSize	(imageSize)
{
}
1971
// Runs the atomic-add dispatch: fills an input SSBO with random values,
// dispatches one workgroup per texel, copies the image into a readback buffer
// and verifies each texel equals the sum of its workgroup's input values.
tcu::TestStatus ImageAtomicOpTestInstance::iterate (void)
{
	const DeviceInterface& vk = m_context.getDeviceInterface();
	const VkDevice device = m_context.getDevice();
	const VkQueue queue = m_context.getUniversalQueue();
	const deUint32 queueFamilyIndex = m_context.getUniversalQueueFamilyIndex();
	Allocator& allocator = m_context.getDefaultAllocator();

	// Create an image (storage for the shader, transfer source for the readback copy)

	const VkImageCreateInfo imageParams = make2DImageCreateInfo(m_imageSize, VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_STORAGE_BIT);
	const ImageWithMemory image(vk, device, allocator, imageParams, MemoryRequirement::Any);

	const VkImageSubresourceRange subresourceRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
	const Unique<VkImageView> imageView(makeImageView(vk, device, *image, VK_IMAGE_VIEW_TYPE_2D, VK_FORMAT_R32_UINT, subresourceRange));

	// Input buffer (one value per shader invocation)

	const deUint32 numInputValues = multiplyComponents(m_imageSize) * m_localSize;
	const VkDeviceSize inputBufferSizeBytes = sizeof(deUint32) * numInputValues;

	const BufferWithMemory inputBuffer(vk, device, allocator, makeBufferCreateInfo(inputBufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);

	// Populate the input buffer with test data
	// (fixed seed keeps the run reproducible)
	{
		de::Random rnd(0x77238ac2);
		const Allocation& inputBufferAllocation = inputBuffer.getAllocation();
		deUint32* bufferPtr = static_cast<deUint32*>(inputBufferAllocation.getHostPtr());
		for (deUint32 i = 0; i < numInputValues; ++i)
			*bufferPtr++ = rnd.getUint32();

		flushAlloc(vk, device, inputBufferAllocation);
	}

	// Create a buffer to store shader output (copied from image data)

	const deUint32 imageArea = multiplyComponents(m_imageSize);
	const VkDeviceSize outputBufferSizeBytes = sizeof(deUint32) * imageArea;
	const BufferWithMemory outputBuffer(vk, device, allocator, makeBufferCreateInfo(outputBufferSizeBytes, VK_BUFFER_USAGE_TRANSFER_DST_BIT), MemoryRequirement::HostVisible);

	// Create descriptor set: binding 0 = input SSBO, binding 1 = storage image

	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
		DescriptorSetLayoutBuilder()
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_SHADER_STAGE_COMPUTE_BIT)
		.build(vk, device));

	const Unique<VkDescriptorPool> descriptorPool(
		DescriptorPoolBuilder()
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE)
		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

	// Set the bindings

	const VkDescriptorImageInfo imageDescriptorInfo = makeDescriptorImageInfo(DE_NULL, *imageView, VK_IMAGE_LAYOUT_GENERAL);
	const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*inputBuffer, 0ull, inputBufferSizeBytes);

	DescriptorSetUpdateBuilder()
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &imageDescriptorInfo)
		.update(vk, device);

	// Perform the computation
	{
		const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
		const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
		const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));

		// Make the host-written input data visible to shader reads
		const VkBufferMemoryBarrier inputBufferPostHostWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *inputBuffer, 0ull, inputBufferSizeBytes);

		// Transition the image UNDEFINED -> GENERAL so the shader may store/atomically update it
		const VkImageMemoryBarrier imageLayoutBarrier = makeImageMemoryBarrier(
			(VkAccessFlags)0, VK_ACCESS_SHADER_WRITE_BIT,
			VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL,
			*image, subresourceRange);

		// Prepare the command buffer

		const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
		const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

		// Start recording commands

		beginCommandBuffer(vk, *cmdBuffer);

		vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
		vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);

		vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &inputBufferPostHostWriteBarrier, 1, &imageLayoutBarrier);
		// One workgroup per texel
		vk.cmdDispatch(*cmdBuffer, m_imageSize.x(), m_imageSize.y(), 1u);

		// Copy the image into the host-visible output buffer (helper inserts the needed barriers)
		copyImageToBuffer(vk, *cmdBuffer, *image, *outputBuffer, m_imageSize, VK_ACCESS_SHADER_WRITE_BIT, VK_IMAGE_LAYOUT_GENERAL);

		endCommandBuffer(vk, *cmdBuffer);

		// Wait for completion

		submitCommandsAndWait(vk, device, queue, *cmdBuffer);
	}

	// Validate the results

	const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
	invalidateAlloc(vk, device, outputBufferAllocation);

	const deUint32* bufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
	const deUint32* refBufferPtr = static_cast<deUint32*>(inputBuffer.getAllocation().getHostPtr());

	// Each texel must equal the (wrapping) sum of its workgroup's m_localSize input values
	for (deUint32 pixelNdx = 0; pixelNdx < imageArea; ++pixelNdx)
	{
		const deUint32 res = bufferPtr[pixelNdx];
		deUint32 ref = 0;

		for (deUint32 offs = 0; offs < m_localSize; ++offs)
			ref += refBufferPtr[pixelNdx * m_localSize + offs];

		if (res != ref)
		{
			std::ostringstream msg;
			msg << "Comparison failed for pixel " << pixelNdx;
			return tcu::TestStatus::fail(msg.str());
		}
	}
	return tcu::TestStatus::pass("Compute succeeded");
}
2100
// Test case: two compute pipelines communicate through a storage image.
// The first pass writes one value per texel; the second pass reads the image
// and atomically sums the values into an SSBO.
class ImageBarrierTest : public vkt::TestCase
{
public:
					ImageBarrierTest	(tcu::TestContext&	testCtx,
										 const std::string&	name,
										 const std::string&	description,
										 const tcu::IVec2&	imageSize);

	// Generates two GLSL compute shaders: "comp0" (image producer) and "comp1" (consumer).
	void			initPrograms		(SourceCollections& sourceCollections) const;
	TestInstance*	createInstance		(Context& context) const;

private:
	const tcu::IVec2	m_imageSize;	// image dimensions; one workgroup per texel
};
2115
// Instance side of ImageBarrierTest: records both dispatches and validates
// the final atomic sum on the host.
class ImageBarrierTestInstance : public vkt::TestInstance
{
public:
						ImageBarrierTestInstance	(Context&			context,
													 const tcu::IVec2&	imageSize);

	tcu::TestStatus		iterate						(void);

private:
	const tcu::IVec2	m_imageSize;	// image dimensions
};
2127
// Constructor: only stores the image dimensions.
ImageBarrierTest::ImageBarrierTest (tcu::TestContext& testCtx,
									const std::string& name,
									const std::string& description,
									const tcu::IVec2& imageSize)
	: TestCase		(testCtx, name, description)
	, m_imageSize	(imageSize)
{
}
2136
initPrograms(SourceCollections & sourceCollections) const2137 void ImageBarrierTest::initPrograms (SourceCollections& sourceCollections) const
2138 {
2139 sourceCollections.glslSources.add("comp0") << glu::ComputeSource(
2140 "#version 310 es\n"
2141 "layout (local_size_x = 1) in;\n"
2142 "layout(binding = 2) readonly uniform Constants {\n"
2143 " uint u_baseVal;\n"
2144 "};\n"
2145 "layout(binding = 1, r32ui) writeonly uniform highp uimage2D u_img;\n"
2146 "void main (void) {\n"
2147 " uint offset = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
2148 " imageStore(u_img, ivec2(gl_WorkGroupID.xy), uvec4(offset + u_baseVal, 0, 0, 0));\n"
2149 "}\n");
2150
2151 sourceCollections.glslSources.add("comp1") << glu::ComputeSource(
2152 "#version 310 es\n"
2153 "layout (local_size_x = 1) in;\n"
2154 "layout(binding = 1, r32ui) readonly uniform highp uimage2D u_img;\n"
2155 "layout(binding = 0) coherent buffer Output {\n"
2156 " uint sum;\n"
2157 "};\n"
2158 "void main (void) {\n"
2159 " uint value = imageLoad(u_img, ivec2(gl_WorkGroupID.xy)).x;\n"
2160 " atomicAdd(sum, value);\n"
2161 "}\n");
2162 }
2163
createInstance(Context & context) const2164 TestInstance* ImageBarrierTest::createInstance (Context& context) const
2165 {
2166 return new ImageBarrierTestInstance(context, m_imageSize);
2167 }
2168
// Instance constructor: only stores the image dimensions for iterate().
ImageBarrierTestInstance::ImageBarrierTestInstance (Context& context, const tcu::IVec2& imageSize)
	: TestInstance	(context)
	, m_imageSize	(imageSize)
{
}
2174
// Runs the two-pass compute workload: shader "comp0" writes one unique value
// per workgroup into a storage image, an image memory barrier makes those
// writes visible, then shader "comp1" reads the image and atomically sums all
// texels into a host-visible buffer. The host validates the accumulated sum
// against a reference computed with the same base value.
tcu::TestStatus ImageBarrierTestInstance::iterate (void)
{
    const DeviceInterface&  vk                  = m_context.getDeviceInterface();
    const VkDevice          device              = m_context.getDevice();
    const VkQueue           queue               = m_context.getUniversalQueue();
    const deUint32          queueFamilyIndex    = m_context.getUniversalQueueFamilyIndex();
    Allocator&              allocator           = m_context.getDefaultAllocator();

    // Create an image used by both shaders

    const VkImageCreateInfo imageParams = make2DImageCreateInfo(m_imageSize, VK_IMAGE_USAGE_STORAGE_BIT);
    const ImageWithMemory image(vk, device, allocator, imageParams, MemoryRequirement::Any);

    const VkImageSubresourceRange subresourceRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
    const Unique<VkImageView> imageView(makeImageView(vk, device, *image, VK_IMAGE_VIEW_TYPE_2D, VK_FORMAT_R32_UINT, subresourceRange));

    // Create an output buffer (holds the single accumulated sum)

    const VkDeviceSize outputBufferSizeBytes = sizeof(deUint32);
    const BufferWithMemory outputBuffer(vk, device, allocator, makeBufferCreateInfo(outputBufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);

    // Initialize atomic counter value to zero
    {
        const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
        deUint32* outputBufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
        *outputBufferPtr = 0;
        flushAlloc(vk, device, outputBufferAllocation);
    }

    // Create a uniform buffer (to pass uniform constants)

    const VkDeviceSize uniformBufferSizeBytes = sizeof(deUint32);
    const BufferWithMemory uniformBuffer(vk, device, allocator, makeBufferCreateInfo(uniformBufferSizeBytes, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT), MemoryRequirement::HostVisible);

    // Set the constants in the uniform buffer

    const deUint32 baseValue = 127;
    {
        const Allocation& uniformBufferAllocation = uniformBuffer.getAllocation();
        deUint32* uniformBufferPtr = static_cast<deUint32*>(uniformBufferAllocation.getHostPtr());
        uniformBufferPtr[0] = baseValue;

        flushAlloc(vk, device, uniformBufferAllocation);
    }

    // Create descriptor set (bindings 0/1/2 match the shader declarations:
    // output SSBO, storage image, uniform constants)

    const Unique<VkDescriptorSetLayout> descriptorSetLayout(
        DescriptorSetLayoutBuilder()
        .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
        .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_SHADER_STAGE_COMPUTE_BIT)
        .addSingleBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
        .build(vk, device));

    const Unique<VkDescriptorPool> descriptorPool(
        DescriptorPoolBuilder()
        .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
        .addType(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE)
        .addType(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)
        .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

    const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

    const VkDescriptorImageInfo imageDescriptorInfo = makeDescriptorImageInfo(DE_NULL, *imageView, VK_IMAGE_LAYOUT_GENERAL);
    const VkDescriptorBufferInfo outputBufferDescriptorInfo = makeDescriptorBufferInfo(*outputBuffer, 0ull, outputBufferSizeBytes);
    const VkDescriptorBufferInfo uniformBufferDescriptorInfo = makeDescriptorBufferInfo(*uniformBuffer, 0ull, uniformBufferSizeBytes);
    DescriptorSetUpdateBuilder()
        .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &outputBufferDescriptorInfo)
        .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &imageDescriptorInfo)
        .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(2u), VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, &uniformBufferDescriptorInfo)
        .update(vk, device);

    // Perform the computation

    const Unique<VkShaderModule> shaderModule0(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp0"), 0));
    const Unique<VkShaderModule> shaderModule1(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp1"), 0));

    // Both pipelines share the same layout, so the descriptor set stays bound
    // across the pipeline switch below.
    const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
    const Unique<VkPipeline> pipeline0(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule0));
    const Unique<VkPipeline> pipeline1(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule1));

    // Make host-written uniform data visible to the first dispatch.
    const VkBufferMemoryBarrier writeUniformConstantsBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_UNIFORM_READ_BIT, *uniformBuffer, 0ull, uniformBufferSizeBytes);

    // Layout transition UNDEFINED -> GENERAL before first use; access masks
    // are 0 because no prior memory access needs to be made available.
    const VkImageMemoryBarrier imageLayoutBarrier = makeImageMemoryBarrier(
        0u, 0u,
        VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL,
        *image, subresourceRange);

    // The barrier under test: make comp0's image writes visible to comp1's reads.
    const VkImageMemoryBarrier imageBarrierBetweenShaders = makeImageMemoryBarrier(
        VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT,
        VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_GENERAL,
        *image, subresourceRange);

    // Make the final shader writes visible to the host read-back.
    const VkBufferMemoryBarrier afterComputeBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *outputBuffer, 0ull, outputBufferSizeBytes);

    const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
    const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

    // Start recording commands

    beginCommandBuffer(vk, *cmdBuffer);

    vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline0);
    vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);

    vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &writeUniformConstantsBarrier, 1, &imageLayoutBarrier);

    // First pass: one workgroup per texel writes the image.
    vk.cmdDispatch(*cmdBuffer, m_imageSize.x(), m_imageSize.y(), 1u);
    vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 0, (const VkBufferMemoryBarrier*)DE_NULL, 1, &imageBarrierBetweenShaders);

    // Switch to the second shader program
    vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline1);

    // Second pass: one workgroup per texel accumulates the sum.
    vk.cmdDispatch(*cmdBuffer, m_imageSize.x(), m_imageSize.y(), 1u);
    vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &afterComputeBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

    endCommandBuffer(vk, *cmdBuffer);

    // Wait for completion

    submitCommandsAndWait(vk, device, queue, *cmdBuffer);

    // Validate the results

    const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
    invalidateAlloc(vk, device, outputBufferAllocation);

    const int numValues = multiplyComponents(m_imageSize);
    const deUint32* bufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
    const deUint32 res = *bufferPtr;
    deUint32 ref = 0;

    // Reference: sum over all workgroups of (baseValue + linear workgroup index).
    for (int ndx = 0; ndx < numValues; ++ndx)
        ref += baseValue + ndx;

    if (res != ref)
    {
        std::ostringstream msg;
        msg << "ERROR: comparison failed, expected " << ref << ", got " << res;
        return tcu::TestStatus::fail(msg.str());
    }
    return tcu::TestStatus::pass("Compute succeeded");
}
2318
// Base instance class for device-group compute tests: on construction it
// creates a custom Vulkan instance with VK_KHR_device_group_creation and a
// logical device spanning all physical devices in the selected group, and
// exposes accessors for the resulting handles.
class ComputeTestInstance : public vkt::TestInstance
{
public:
    ComputeTestInstance (Context& context)
        : TestInstance (context)
        , m_numPhysDevices (1)
        , m_queueFamilyIndex (0)
    {
        // Builds m_deviceGroupInstance / m_logicalDevice / m_deviceDriver and
        // fills in m_numPhysDevices and m_queueFamilyIndex.
        createDeviceGroup();
    }

    ~ComputeTestInstance ()
    {
    }

    void                                createDeviceGroup   (void);
    // Device interface for the group device (NOT the context's default device).
    const vk::DeviceInterface&          getDeviceInterface  (void) { return *m_deviceDriver; }
    vk::VkInstance                      getInstance         (void) { return m_deviceGroupInstance; }
    vk::VkDevice                        getDevice           (void) { return *m_logicalDevice; }
    // Physical device i within the selected device group (no bounds checking).
    vk::VkPhysicalDevice                getPhysicalDevice   (deUint32 i = 0){ return m_physicalDevices[i]; }

protected:
    deUint32                            m_numPhysDevices;   // number of physical devices in the group
    deUint32                            m_queueFamilyIndex; // compute-capable queue family on the group device

private:
    CustomInstance                      m_deviceGroupInstance;
    vk::Move<vk::VkDevice>              m_logicalDevice;
    std::vector<vk::VkPhysicalDevice>   m_physicalDevices;
#ifndef CTS_USES_VULKANSC
    de::MovePtr<vk::DeviceDriver>       m_deviceDriver;
#else
    // Vulkan SC uses a driver wrapper with a custom deleter tied to the
    // resource interface.
    de::MovePtr<vk::DeviceDriverSC, vk::DeinitDeviceDeleter>    m_deviceDriver;
#endif // CTS_USES_VULKANSC
};
2354
// Creates a custom instance with VK_KHR_device_group_creation, selects the
// device group and physical device requested on the command line, and builds
// a logical device that spans every physical device in that group. Also picks
// a compute-capable queue family and instantiates the matching device driver.
void ComputeTestInstance::createDeviceGroup (void)
{
    const tcu::CommandLine&             cmdLine             = m_context.getTestContext().getCommandLine();
    const deUint32                      devGroupIdx         = cmdLine.getVKDeviceGroupId() - 1; // command-line ids are 1-based
    const deUint32                      physDeviceIdx       = cmdLine.getVKDeviceId() - 1;      // command-line ids are 1-based
    const float                         queuePriority       = 1.0f;
    const std::vector<std::string>      requiredExtensions  (1, "VK_KHR_device_group_creation");
    m_deviceGroupInstance   = createCustomInstanceWithExtensions(m_context, requiredExtensions);
    std::vector<VkPhysicalDeviceGroupProperties>    devGroupProperties = enumeratePhysicalDeviceGroups(m_context.getInstanceInterface(), m_deviceGroupInstance);
    m_numPhysDevices        = devGroupProperties[devGroupIdx].physicalDeviceCount;
    std::vector<const char*>            deviceExtensions;

    // VK_KHR_device_group is core from Vulkan 1.1; only request it explicitly
    // on older API versions.
    if (!isCoreDeviceExtension(m_context.getUsedApiVersion(), "VK_KHR_device_group"))
        deviceExtensions.push_back("VK_KHR_device_group");

    VkDeviceGroupDeviceCreateInfo deviceGroupInfo =
    {
        VK_STRUCTURE_TYPE_DEVICE_GROUP_DEVICE_CREATE_INFO,      //stype
        DE_NULL,                                                //pNext
        devGroupProperties[devGroupIdx].physicalDeviceCount,    //physicalDeviceCount
        devGroupProperties[devGroupIdx].physicalDevices         //physicalDevices
    };
    const InstanceDriver& instance (m_deviceGroupInstance.getDriver());
    // Features/queues are queried on the command-line-selected physical device.
    const VkPhysicalDeviceFeatures deviceFeatures = getPhysicalDeviceFeatures(instance, deviceGroupInfo.pPhysicalDevices[physDeviceIdx]);
    const std::vector<VkQueueFamilyProperties> queueProps = getPhysicalDeviceQueueFamilyProperties(instance, devGroupProperties[devGroupIdx].physicalDevices[physDeviceIdx]);

    m_physicalDevices.resize(m_numPhysDevices);
    for (deUint32 physDevIdx = 0; physDevIdx < m_numPhysDevices; physDevIdx++)
        m_physicalDevices[physDevIdx] = devGroupProperties[devGroupIdx].physicalDevices[physDevIdx];

    // NOTE(review): the loop does not break, so this selects the LAST
    // compute-capable queue family rather than the first.
    for (size_t queueNdx = 0; queueNdx < queueProps.size(); queueNdx++)
    {
        if (queueProps[queueNdx].queueFlags & VK_QUEUE_COMPUTE_BIT)
            m_queueFamilyIndex = (deUint32)queueNdx;
    }

    VkDeviceQueueCreateInfo queueInfo =
    {
        VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,     // VkStructureType sType;
        DE_NULL,                                        // const void* pNext;
        (VkDeviceQueueCreateFlags)0u,                   // VkDeviceQueueCreateFlags flags;
        m_queueFamilyIndex,                             // deUint32 queueFamilyIndex;
        1u,                                             // deUint32 queueCount;
        &queuePriority                                  // const float* pQueuePriorities;
    };

    // Chain the device-group info (and, for Vulkan SC, the object reservation
    // and SC 1.0 feature structs) into the device create pNext chain.
    void* pNext = &deviceGroupInfo;
#ifdef CTS_USES_VULKANSC
    VkDeviceObjectReservationCreateInfo memReservationInfo = cmdLine.isSubProcess() ? m_context.getResourceInterface()->getStatMax() : resetDeviceObjectReservationCreateInfo();
    memReservationInfo.pNext = pNext;
    pNext = &memReservationInfo;

    VkPhysicalDeviceVulkanSC10Features sc10Features = createDefaultSC10Features();
    sc10Features.pNext = pNext;
    pNext = &sc10Features;
    VkPipelineCacheCreateInfo pcCI;
    std::vector<VkPipelinePoolSize> poolSizes;
    if (cmdLine.isSubProcess())
    {
        // In a sub-process run, seed the pipeline cache with the data recorded
        // by the main process.
        if (m_context.getResourceInterface()->getCacheDataSize() > 0)
        {
            pcCI =
            {
                VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO,           // VkStructureType sType;
                DE_NULL,                                                // const void* pNext;
                VK_PIPELINE_CACHE_CREATE_READ_ONLY_BIT |
                    VK_PIPELINE_CACHE_CREATE_USE_APPLICATION_STORAGE_BIT,   // VkPipelineCacheCreateFlags flags;
                m_context.getResourceInterface()->getCacheDataSize(),   // deUintptr initialDataSize;
                m_context.getResourceInterface()->getCacheData()        // const void* pInitialData;
            };
            memReservationInfo.pipelineCacheCreateInfoCount = 1;
            memReservationInfo.pPipelineCacheCreateInfos = &pcCI;
        }

        poolSizes = m_context.getResourceInterface()->getPipelinePoolSizes();
        if (!poolSizes.empty())
        {
            memReservationInfo.pipelinePoolSizeCount = deUint32(poolSizes.size());
            memReservationInfo.pPipelinePoolSizes = poolSizes.data();
        }
    }

#endif // CTS_USES_VULKANSC

    const VkDeviceCreateInfo deviceInfo =
    {
        VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,                           // VkStructureType sType;
        pNext,                                                          // const void* pNext;
        (VkDeviceCreateFlags)0,                                         // VkDeviceCreateFlags flags;
        1u ,                                                            // uint32_t queueCreateInfoCount;
        &queueInfo,                                                     // const VkDeviceQueueCreateInfo* pQueueCreateInfos;
        0u,                                                             // uint32_t enabledLayerCount;
        DE_NULL,                                                        // const char* const* ppEnabledLayerNames;
        deUint32(deviceExtensions.size()),                              // uint32_t enabledExtensionCount;
        (deviceExtensions.empty() ? DE_NULL : &deviceExtensions[0]),    // const char* const* ppEnabledExtensionNames;
        &deviceFeatures,                                                // const VkPhysicalDeviceFeatures* pEnabledFeatures;
    };

    m_logicalDevice = createCustomDevice(m_context.getTestContext().getCommandLine().isValidationEnabled(), m_context.getPlatformInterface(), m_deviceGroupInstance, instance, deviceGroupInfo.pPhysicalDevices[physDeviceIdx], &deviceInfo);
#ifndef CTS_USES_VULKANSC
    m_deviceDriver = de::MovePtr<DeviceDriver>(new DeviceDriver(m_context.getPlatformInterface(), m_deviceGroupInstance, *m_logicalDevice));
#else
    m_deviceDriver = de::MovePtr<DeviceDriverSC, DeinitDeviceDeleter>(new DeviceDriverSC(m_context.getPlatformInterface(), m_context.getInstance(), *m_logicalDevice, m_context.getTestContext().getCommandLine(), m_context.getResourceInterface(), m_context.getDeviceVulkanSC10Properties(), m_context.getDeviceProperties()), vk::DeinitDeviceDeleter(m_context.getResourceInterface().get(), *m_logicalDevice));
#endif // CTS_USES_VULKANSC
}
2460
// Test case for vkCmdDispatchBase: a compute grid of m_workSize workgroups is
// executed as many smaller dispatches with non-zero base workgroup offsets,
// split across physical devices by m_splitSize and further subdivided by
// m_localSize inside each device.
class DispatchBaseTest : public vkt::TestCase
{
public:
                        DispatchBaseTest    (tcu::TestContext&  testCtx,
                                             const std::string& name,
                                             const std::string& description,
                                             const deUint32     numValues,
                                             const tcu::IVec3&  localsize,
                                             const tcu::IVec3&  worksize,
                                             const tcu::IVec3&  splitsize);

    void                initPrograms        (SourceCollections& sourceCollections) const;
    TestInstance*       createInstance      (Context&           context) const;

private:
    const deUint32      m_numValues;    // number of elements in the in/out SSBO
    const tcu::IVec3    m_localSize;    // shader local workgroup size
    const tcu::IVec3    m_workSize;     // total grid size in workgroups
    const tcu::IVec3    m_splitSize;    // per-physical-device slice of the grid
};
2481
// Instance for DispatchBaseTest: validates the split parameters on
// construction and executes the partitioned vkCmdDispatchBase workload on a
// device-group logical device.
class DispatchBaseTestInstance : public ComputeTestInstance
{
public:
                        DispatchBaseTestInstance    (Context&           context,
                                                     const deUint32     numValues,
                                                     const tcu::IVec3&  localsize,
                                                     const tcu::IVec3&  worksize,
                                                     const tcu::IVec3&  splitsize);

    // Returns true iff 'big' is component-wise >= 'small' and divisible by it.
    bool                isInputVectorValid          (const tcu::IVec3& small, const tcu::IVec3& big);
    tcu::TestStatus     iterate                     (void);

private:
    const deUint32      m_numValues;        // number of elements in the in/out SSBO
    const tcu::IVec3    m_localSize;        // shader local workgroup size
    const tcu::IVec3    m_workSize;         // total grid size in workgroups
    const tcu::IVec3    m_splitWorkSize;    // per-physical-device slice of the grid
};
2500
DispatchBaseTest(tcu::TestContext & testCtx,const std::string & name,const std::string & description,const deUint32 numValues,const tcu::IVec3 & localsize,const tcu::IVec3 & worksize,const tcu::IVec3 & splitsize)2501 DispatchBaseTest::DispatchBaseTest (tcu::TestContext& testCtx,
2502 const std::string& name,
2503 const std::string& description,
2504 const deUint32 numValues,
2505 const tcu::IVec3& localsize,
2506 const tcu::IVec3& worksize,
2507 const tcu::IVec3& splitsize)
2508 : TestCase (testCtx, name, description)
2509 , m_numValues (numValues)
2510 , m_localSize (localsize)
2511 , m_workSize (worksize)
2512 , m_splitSize (splitsize)
2513 {
2514 }
2515
initPrograms(SourceCollections & sourceCollections) const2516 void DispatchBaseTest::initPrograms (SourceCollections& sourceCollections) const
2517 {
2518 std::ostringstream src;
2519 src << "#version 310 es\n"
2520 << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
2521
2522 << "layout(binding = 0) buffer InOut {\n"
2523 << " uint values[" << de::toString(m_numValues) << "];\n"
2524 << "} sb_inout;\n"
2525
2526 << "layout(binding = 1) readonly uniform uniformInput {\n"
2527 << " uvec3 gridSize;\n"
2528 << "} ubo_in;\n"
2529
2530 << "void main (void) {\n"
2531 << " uvec3 size = ubo_in.gridSize * gl_WorkGroupSize;\n"
2532 << " uint numValuesPerInv = uint(sb_inout.values.length()) / (size.x*size.y*size.z);\n"
2533 << " uint index = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
2534 << " uint offset = numValuesPerInv*index;\n"
2535 << " for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
2536 << " sb_inout.values[offset + ndx] = ~sb_inout.values[offset + ndx];\n"
2537 << "}\n";
2538
2539 sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
2540 }
2541
createInstance(Context & context) const2542 TestInstance* DispatchBaseTest::createInstance (Context& context) const
2543 {
2544 return new DispatchBaseTestInstance(context, m_numValues, m_localSize, m_workSize, m_splitSize);
2545 }
2546
DispatchBaseTestInstance(Context & context,const deUint32 numValues,const tcu::IVec3 & localsize,const tcu::IVec3 & worksize,const tcu::IVec3 & splitsize)2547 DispatchBaseTestInstance::DispatchBaseTestInstance (Context& context,
2548 const deUint32 numValues,
2549 const tcu::IVec3& localsize,
2550 const tcu::IVec3& worksize,
2551 const tcu::IVec3& splitsize)
2552
2553 : ComputeTestInstance (context)
2554 , m_numValues (numValues)
2555 , m_localSize (localsize)
2556 , m_workSize (worksize)
2557 , m_splitWorkSize (splitsize)
2558 {
2559 // For easy work distribution across physical devices:
2560 // WorkSize should be a multiple of SplitWorkSize only in the X component
2561 if ((!isInputVectorValid(m_splitWorkSize, m_workSize)) ||
2562 (m_workSize.x() <= m_splitWorkSize.x()) ||
2563 (m_workSize.y() != m_splitWorkSize.y()) ||
2564 (m_workSize.z() != m_splitWorkSize.z()))
2565 TCU_THROW(TestError, "Invalid Input.");
2566
2567 // For easy work distribution within the same physical device:
2568 // SplitWorkSize should be a multiple of localSize in Y or Z component
2569 if ((!isInputVectorValid(m_localSize, m_splitWorkSize)) ||
2570 (m_localSize.x() != m_splitWorkSize.x()) ||
2571 (m_localSize.y() >= m_splitWorkSize.y()) ||
2572 (m_localSize.z() >= m_splitWorkSize.z()))
2573 TCU_THROW(TestError, "Invalid Input.");
2574
2575 if ((multiplyComponents(m_workSize) / multiplyComponents(m_splitWorkSize)) < (deInt32) m_numPhysDevices)
2576 TCU_THROW(TestError, "Not enough work to distribute across all physical devices.");
2577
2578 deUint32 totalWork = multiplyComponents(m_workSize) * multiplyComponents(m_localSize);
2579 if ((totalWork > numValues) || (numValues % totalWork != 0))
2580 TCU_THROW(TestError, "Buffer too small/not aligned to cover all values.");
2581 }
2582
isInputVectorValid(const tcu::IVec3 & small,const tcu::IVec3 & big)2583 bool DispatchBaseTestInstance::isInputVectorValid(const tcu::IVec3& small, const tcu::IVec3& big)
2584 {
2585 if (((big.x() < small.x()) || (big.y() < small.y()) || (big.z() < small.z())) ||
2586 ((big.x() % small.x() != 0) || (big.y() % small.y() != 0) || (big.z() % small.z() != 0)))
2587 return false;
2588 return true;
2589 }
2590
// Runs the partitioned dispatch: fills an SSBO with random values, records one
// vkCmdDispatchBase per (device, Y-chunk, Z-chunk) with appropriate base
// workgroup offsets, then checks every element was bit-inverted exactly once.
tcu::TestStatus DispatchBaseTestInstance::iterate (void)
{
    // Note: uses the device-group device/driver from ComputeTestInstance,
    // not the context's default device.
    const DeviceInterface&  vk                  = getDeviceInterface();
    const VkDevice          device              = getDevice();
    const VkQueue           queue               = getDeviceQueue(vk, device, m_queueFamilyIndex, 0);
    SimpleAllocator         allocator           (vk, device, getPhysicalDeviceMemoryProperties(m_context.getInstanceInterface(), getPhysicalDevice()));
    deUint32                totalWorkloadSize   = 0;    // workgroups dispatched, summed for a coverage check

    // Create an uniform and input/output buffer
    const deUint32 uniformBufSize = 3; // Pass the compute grid size
    const VkDeviceSize uniformBufferSizeBytes = sizeof(deUint32) * uniformBufSize;
    const BufferWithMemory uniformBuffer(vk, device, allocator, makeBufferCreateInfo(uniformBufferSizeBytes, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT), MemoryRequirement::HostVisible);

    const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * m_numValues;
    const BufferWithMemory buffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);

    // Fill the buffers with data; host-side copies are kept to validate against.
    typedef std::vector<deUint32> data_vector_t;
    data_vector_t uniformInputData(uniformBufSize);
    data_vector_t inputData(m_numValues);

    {
        // Full grid size goes into the UBO (the shader cannot rely on
        // gl_NumWorkGroups, which only covers one partial dispatch).
        const Allocation& bufferAllocation = uniformBuffer.getAllocation();
        deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
        uniformInputData[0] = *bufferPtr++ = m_workSize.x();
        uniformInputData[1] = *bufferPtr++ = m_workSize.y();
        uniformInputData[2] = *bufferPtr++ = m_workSize.z();
        flushAlloc(vk, device, bufferAllocation);
    }

    {
        de::Random rnd(0x82ce7f);
        const Allocation& bufferAllocation = buffer.getAllocation();
        deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
        for (deUint32 i = 0; i < m_numValues; ++i)
            inputData[i] = *bufferPtr++ = rnd.getUint32();

        flushAlloc(vk, device, bufferAllocation);
    }

    // Create descriptor set (binding 0: SSBO, binding 1: UBO, as in the shader)
    const Unique<VkDescriptorSetLayout> descriptorSetLayout(
        DescriptorSetLayoutBuilder()
        .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
        .addSingleBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
        .build(vk, device));

    const Unique<VkDescriptorPool> descriptorPool(
        DescriptorPoolBuilder()
        .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
        .addType(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)
        .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

    const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

    const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*buffer, 0ull, bufferSizeBytes);
    const VkDescriptorBufferInfo uniformBufferDescriptorInfo = makeDescriptorBufferInfo(*uniformBuffer, 0ull, uniformBufferSizeBytes);

    DescriptorSetUpdateBuilder()
        .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
        .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, &uniformBufferDescriptorInfo)
        .update(vk, device);

    const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
    const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
    // VK_PIPELINE_CREATE_DISPATCH_BASE is required to use vkCmdDispatchBase
    // with non-zero base workgroup offsets.
    const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, static_cast<VkPipelineCreateFlags>(VK_PIPELINE_CREATE_DISPATCH_BASE), *shaderModule, static_cast<VkPipelineShaderStageCreateFlags>(0u)));

    // Host writes -> shader reads, and shader writes -> host read-back.
    const VkBufferMemoryBarrier hostWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *buffer, 0ull, bufferSizeBytes);
    const VkBufferMemoryBarrier hostUniformWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_UNIFORM_READ_BIT, *uniformBuffer, 0ull, uniformBufferSizeBytes);

    const VkBufferMemoryBarrier shaderWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer, 0ull, bufferSizeBytes);

    const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, m_queueFamilyIndex));
    const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

    // Start recording commands
    beginCommandBuffer(vk, *cmdBuffer);

    vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
    vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);

    vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostUniformWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

    vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

    // Split the workload across all physical devices based on m_splitWorkSize.x()
    for (deUint32 physDevIdx = 0; physDevIdx < m_numPhysDevices; physDevIdx++)
    {
        deUint32 baseGroupX = physDevIdx * m_splitWorkSize.x();
        deUint32 baseGroupY = 0;
        deUint32 baseGroupZ = 0;

        // Split the workload within the physical device based on m_localSize.y() and m_localSize.z()
        for (deInt32 localIdxY = 0; localIdxY < (m_splitWorkSize.y() / m_localSize.y()); localIdxY++)
        {
            for (deInt32 localIdxZ = 0; localIdxZ < (m_splitWorkSize.z() / m_localSize.z()); localIdxZ++)
            {
                deUint32 offsetX = baseGroupX;
                deUint32 offsetY = baseGroupY + localIdxY * m_localSize.y();
                deUint32 offsetZ = baseGroupZ + localIdxZ * m_localSize.z();

                // Note: despite the names, these are the workgroup COUNTS
                // passed to vkCmdDispatchBase. The last device picks up any
                // remaining workgroups in X.
                deUint32 localSizeX = (physDevIdx == (m_numPhysDevices - 1)) ? m_workSize.x() - baseGroupX : m_localSize.x();
                deUint32 localSizeY = m_localSize.y();
                deUint32 localSizeZ = m_localSize.z();

                totalWorkloadSize += (localSizeX * localSizeY * localSizeZ);
                vk.cmdDispatchBase(*cmdBuffer, offsetX, offsetY, offsetZ, localSizeX, localSizeY, localSizeZ);
            }
        }
    }

    vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &shaderWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

    endCommandBuffer(vk, *cmdBuffer);
    submitCommandsAndWait(vk, device, queue, *cmdBuffer);

    // Sanity check: the partial dispatches together must cover the whole grid.
    if (totalWorkloadSize != deUint32(multiplyComponents(m_workSize)))
        TCU_THROW(TestError, "Not covering the entire workload.");

    // Validate the results: every value must be the bitwise inverse of its input.
    const Allocation& bufferAllocation = buffer.getAllocation();
    invalidateAlloc(vk, device, bufferAllocation);
    const deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());

    for (deUint32 ndx = 0; ndx < m_numValues; ++ndx)
    {
        const deUint32 res = bufferPtr[ndx];
        const deUint32 ref = ~inputData[ndx];

        if (res != ref)
        {
            std::ostringstream msg;
            msg << "Comparison failed for InOut.values[" << ndx << "]";
            return tcu::TestStatus::fail(msg.str());
        }
    }
    return tcu::TestStatus::pass("Compute succeeded");
}
2729
2730 class DeviceIndexTest : public vkt::TestCase
2731 {
2732 public:
2733 DeviceIndexTest (tcu::TestContext& testCtx,
2734 const std::string& name,
2735 const std::string& description,
2736 const deUint32 numValues,
2737 const tcu::IVec3& localsize,
2738 const tcu::IVec3& splitsize);
2739
2740 void initPrograms (SourceCollections& sourceCollections) const;
2741 TestInstance* createInstance (Context& context) const;
2742
2743 private:
2744 const deUint32 m_numValues;
2745 const tcu::IVec3 m_localSize;
2746 const tcu::IVec3 m_workSize;
2747 const tcu::IVec3 m_splitSize;
2748 };
2749
// Instance for DeviceIndexTest: runs the gl_DeviceIndex shader on a
// device-group logical device created by ComputeTestInstance.
class DeviceIndexTestInstance : public ComputeTestInstance
{
public:
                        DeviceIndexTestInstance (Context&           context,
                                                 const deUint32     numValues,
                                                 const tcu::IVec3&  localsize,
                                                 const tcu::IVec3&  worksize);
    tcu::TestStatus     iterate                 (void);
private:
    const deUint32      m_numValues;    // number of elements in the in/out SSBO
    const tcu::IVec3    m_localSize;    // shader local workgroup size
    // Intentionally non-const (unlike m_localSize) — presumably adjusted
    // inside iterate(); confirm against the iterate() implementation.
    tcu::IVec3          m_workSize;
};
2763
DeviceIndexTest(tcu::TestContext & testCtx,const std::string & name,const std::string & description,const deUint32 numValues,const tcu::IVec3 & localsize,const tcu::IVec3 & worksize)2764 DeviceIndexTest::DeviceIndexTest (tcu::TestContext& testCtx,
2765 const std::string& name,
2766 const std::string& description,
2767 const deUint32 numValues,
2768 const tcu::IVec3& localsize,
2769 const tcu::IVec3& worksize)
2770 : TestCase (testCtx, name, description)
2771 , m_numValues (numValues)
2772 , m_localSize (localsize)
2773 , m_workSize (worksize)
2774 {
2775 }
2776
initPrograms(SourceCollections & sourceCollections) const2777 void DeviceIndexTest::initPrograms (SourceCollections& sourceCollections) const
2778 {
2779 std::ostringstream src;
2780 src << "#version 310 es\n"
2781 << "#extension GL_EXT_device_group : require\n"
2782 << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
2783
2784 << "layout(binding = 0) buffer InOut {\n"
2785 << " uint values[" << de::toString(m_numValues) << "];\n"
2786 << "} sb_inout;\n"
2787
2788 << "layout(binding = 1) readonly uniform uniformInput {\n"
2789 << " uint baseOffset[1+" << VK_MAX_DEVICE_GROUP_SIZE << "];\n"
2790 << "} ubo_in;\n"
2791
2792 << "void main (void) {\n"
2793 << " uvec3 size = gl_NumWorkGroups * gl_WorkGroupSize;\n"
2794 << " uint numValuesPerInv = uint(sb_inout.values.length()) / (size.x*size.y*size.z);\n"
2795 << " uint index = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
2796 << " uint offset = numValuesPerInv*index;\n"
2797 << " for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
2798 << " sb_inout.values[offset + ndx] = ubo_in.baseOffset[0] + ubo_in.baseOffset[gl_DeviceIndex + 1];\n"
2799 << "}\n";
2800
2801 sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
2802 }
2803
createInstance(Context & context) const2804 TestInstance* DeviceIndexTest::createInstance (Context& context) const
2805 {
2806 return new DeviceIndexTestInstance(context, m_numValues, m_localSize, m_workSize);
2807 }
2808
DeviceIndexTestInstance(Context & context,const deUint32 numValues,const tcu::IVec3 & localsize,const tcu::IVec3 & worksize)2809 DeviceIndexTestInstance::DeviceIndexTestInstance (Context& context,
2810 const deUint32 numValues,
2811 const tcu::IVec3& localsize,
2812 const tcu::IVec3& worksize)
2813
2814 : ComputeTestInstance (context)
2815 , m_numValues (numValues)
2816 , m_localSize (localsize)
2817 , m_workSize (worksize)
2818 {}
2819
// Runs the device-group (gl_DeviceIndex) compute test:
//  - allocates an SBO with an explicit per-physical-device allocation mask,
//  - for every possible device mask, dispatches a shader that writes
//    ubo_in.baseOffset[0] + ubo_in.baseOffset[gl_DeviceIndex + 1] to the SBO,
//  - copies the SBO back per physical device and validates the values.
tcu::TestStatus DeviceIndexTestInstance::iterate (void)
{
	const DeviceInterface&	vk				= getDeviceInterface();
	const VkDevice			device			= getDevice();
	const VkQueue			queue			= getDeviceQueue(vk, device, m_queueFamilyIndex, 0);
	SimpleAllocator			allocator		(vk, device, getPhysicalDeviceMemoryProperties(m_context.getInstanceInterface(), getPhysicalDevice()));
	// Mask with one bit set per physical device in the group: allocate on all of them.
	const deUint32			allocDeviceMask	= (1 << m_numPhysDevices) - 1;
	de::Random				rnd				(0x82ce7f);
	Move<VkBuffer>			sboBuffer;
	vk::Move<vk::VkDeviceMemory>	sboBufferMemory;

	// Create an uniform and output buffer.
	// 4 uints per array element matches the std140 array stride (16 bytes) used
	// by the shader's baseOffset[] UBO array — hence the factor of 4 below.
	const deUint32			uniformBufSize			= 4 * (1 + VK_MAX_DEVICE_GROUP_SIZE);
	const VkDeviceSize		uniformBufferSizeBytes	= sizeof(deUint32) * uniformBufSize;
	const BufferWithMemory	uniformBuffer(vk, device, allocator, makeBufferCreateInfo(uniformBufferSizeBytes, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT), MemoryRequirement::HostVisible);

	const VkDeviceSize		bufferSizeBytes	= sizeof(deUint32) * m_numValues;
	// Host-visible staging buffer used to read back and check the SBO contents.
	const BufferWithMemory	checkBuffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_TRANSFER_DST_BIT), MemoryRequirement::HostVisible);

	// create SBO buffer
	// Allocated manually (not via the allocator) so that the allocation can carry
	// VkMemoryAllocateFlagsInfo with an explicit deviceMask for the device group.
	{
		const VkBufferCreateInfo sboBufferParams =
		{
			VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,									// sType
			DE_NULL,																// pNext
			0u,																		// flags
			(VkDeviceSize)bufferSizeBytes,											// size
			VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT,	// usage
			VK_SHARING_MODE_EXCLUSIVE,												// sharingMode
			1u,																		// queueFamilyIndexCount
			&m_queueFamilyIndex,													// pQueueFamilyIndices
		};
		sboBuffer = createBuffer(vk, device, &sboBufferParams);

		// Pick the first device-local memory type compatible with the buffer.
		VkMemoryRequirements						memReqs			= getBufferMemoryRequirements(vk, device, sboBuffer.get());
		deUint32									memoryTypeNdx	= 0;
		const VkPhysicalDeviceMemoryProperties		deviceMemProps	= getPhysicalDeviceMemoryProperties(m_context.getInstanceInterface(), getPhysicalDevice());
		for ( memoryTypeNdx = 0; memoryTypeNdx < deviceMemProps.memoryTypeCount; memoryTypeNdx++)
		{
			if ((memReqs.memoryTypeBits & (1u << memoryTypeNdx)) != 0 &&
				(deviceMemProps.memoryTypes[memoryTypeNdx].propertyFlags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) == VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT)
				break;
		}
		if (memoryTypeNdx == deviceMemProps.memoryTypeCount)
			TCU_THROW(NotSupportedError, "No compatible memory type found");

		// Replicate the allocation on every physical device in the group.
		const VkMemoryAllocateFlagsInfo	allocDeviceMaskInfo =
		{
			VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO,	// sType
			DE_NULL,										// pNext
			VK_MEMORY_ALLOCATE_DEVICE_MASK_BIT,				// flags
			allocDeviceMask,								// deviceMask
		};

		VkMemoryAllocateInfo			allocInfo =
		{
			VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,			// sType
			&allocDeviceMaskInfo,							// pNext
			memReqs.size,									// allocationSize
			memoryTypeNdx,									// memoryTypeIndex
		};

		sboBufferMemory = allocateMemory(vk, device, &allocInfo);
		VK_CHECK(vk.bindBufferMemory(device, *sboBuffer, sboBufferMemory.get(), 0));
	}

	// Fill the buffers with data.
	// uniformInputData keeps a host-side copy of the UBO for result validation.
	typedef std::vector<deUint32> data_vector_t;
	data_vector_t uniformInputData(uniformBufSize, 0);

	{
		const Allocation& bufferAllocation = uniformBuffer.getAllocation();
		deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
		for (deUint32 i = 0; i < uniformBufSize; ++i)
			uniformInputData[i] = *bufferPtr++ = rnd.getUint32() / 10; // divide to prevent overflow in addition

		flushAlloc(vk, device, bufferAllocation);
	}

	// Create descriptor set: binding 0 = SBO (output), binding 1 = UBO (input).
	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
		DescriptorSetLayoutBuilder()
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.addSingleBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.build(vk, device));

	const Unique<VkDescriptorPool> descriptorPool(
		DescriptorPoolBuilder()
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
		.addType(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)
		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

	const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*sboBuffer, 0ull, bufferSizeBytes);
	const VkDescriptorBufferInfo uniformBufferDescriptorInfo = makeDescriptorBufferInfo(*uniformBuffer, 0ull, uniformBufferSizeBytes);

	DescriptorSetUpdateBuilder()
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, &uniformBufferDescriptorInfo)
		.update(vk, device);

	const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
	const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
	const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));

	// Host write -> shader uniform read; shader SBO write -> transfer read (for readback).
	const VkBufferMemoryBarrier hostUniformWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_UNIFORM_READ_BIT, *uniformBuffer, 0ull, uniformBufferSizeBytes);
	const VkBufferMemoryBarrier shaderWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT , *sboBuffer, 0ull, bufferSizeBytes);

	const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, m_queueFamilyIndex));
	const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

	// Verify multiple device masks: iterate over every non-empty subset of
	// physical devices and dispatch on exactly that subset.
	for (deUint32 physDevMask = 1; physDevMask < (1u << m_numPhysDevices); physDevMask++)
	{
		// Refresh baseOffset[0] each loop so stale SBO contents from a previous
		// mask cannot accidentally pass validation.
		deUint32 constantValPerLoop = 0;
		{
			const Allocation& bufferAllocation = uniformBuffer.getAllocation();
			deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
			constantValPerLoop = *bufferPtr = rnd.getUint32() / 10; // divide to prevent overflow in addition
			flushAlloc(vk, device, bufferAllocation);
		}
		beginCommandBuffer(vk, *cmdBuffer);

		vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
		vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
		vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostUniformWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

		// Restrict subsequent commands to the devices selected by this mask.
		vk.cmdSetDeviceMask(*cmdBuffer, physDevMask);
		vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());

		vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &shaderWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

		endCommandBuffer(vk, *cmdBuffer);
		submitCommandsAndWait(vk, device, queue, *cmdBuffer, true, physDevMask);
		m_context.resetCommandPoolForVKSC(device, *cmdPool);

		// Validate the results on all physical devices where compute shader was launched
		const VkBufferMemoryBarrier srcBufferBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT , *sboBuffer, 0ull, bufferSizeBytes);
		const VkBufferMemoryBarrier dstBufferBarrier = makeBufferMemoryBarrier(VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *checkBuffer, 0ull, bufferSizeBytes);
		const VkBufferCopy	copyParams =
		{
			(VkDeviceSize)0u,						// srcOffset
			(VkDeviceSize)0u,						// dstOffset
			bufferSizeBytes							// size
		};

		for (deUint32 physDevIdx = 0; physDevIdx < m_numPhysDevices; physDevIdx++)
		{
			// Only check devices that actually participated in this dispatch.
			if (!(1<<physDevIdx & physDevMask))
				continue;

			const deUint32 deviceMask = 1 << physDevIdx;

			// Copy this device's SBO replica into the host-visible check buffer.
			beginCommandBuffer(vk, *cmdBuffer);
			vk.cmdSetDeviceMask(*cmdBuffer, deviceMask);
			vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT , VK_PIPELINE_STAGE_TRANSFER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &srcBufferBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
			vk.cmdCopyBuffer(*cmdBuffer, *sboBuffer, *checkBuffer, 1, &copyParams);
			vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &dstBufferBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

			endCommandBuffer(vk, *cmdBuffer);
			submitCommandsAndWait(vk, device, queue, *cmdBuffer, true, deviceMask);

			const Allocation& bufferAllocation = checkBuffer.getAllocation();
			invalidateAlloc(vk, device, bufferAllocation);
			const deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());

			// Every element must equal baseOffset[0] + baseOffset[physDevIdx + 1];
			// the 4* factor mirrors the shader's std140 array stride (see above).
			for (deUint32 ndx = 0; ndx < m_numValues; ++ndx)
			{
				const deUint32 res = bufferPtr[ndx];
				const deUint32 ref = constantValPerLoop + uniformInputData[4 * (physDevIdx + 1)];

				if (res != ref)
				{
					std::ostringstream msg;
					msg << "Comparison failed on physical device "<< getPhysicalDevice(physDevIdx) <<" ( deviceMask "<< deviceMask <<" ) for InOut.values[" << ndx << "]";
					return tcu::TestStatus::fail(msg.str());
				}
			}
		}
	}

	return tcu::TestStatus::pass("Compute succeeded");
}
3004
// Test case that runs the same compute shader concurrently on two queues with
// different priorities; the instance (below) performs the submissions and
// checks both results and completion order.
class ConcurrentCompute : public vkt::TestCase
{
public:
						ConcurrentCompute	(tcu::TestContext&	testCtx,
											 const std::string&	name,
											 const std::string&	description);


	// Registers the "comp" GLSL shader shared by both submissions.
	void				initPrograms		(SourceCollections& sourceCollections) const;
	TestInstance*		createInstance		(Context&			context) const;
};
3016
// Instance side of ConcurrentCompute: creates its own device with two compute
// queues and performs the concurrent dispatch and validation in iterate().
class ConcurrentComputeInstance : public vkt::TestInstance
{
public:
					ConcurrentComputeInstance	(Context& context);

	tcu::TestStatus	iterate						(void);
};
3024
ConcurrentCompute(tcu::TestContext & testCtx,const std::string & name,const std::string & description)3025 ConcurrentCompute::ConcurrentCompute (tcu::TestContext& testCtx,
3026 const std::string& name,
3027 const std::string& description)
3028 : TestCase (testCtx, name, description)
3029 {
3030 }
3031
initPrograms(SourceCollections & sourceCollections) const3032 void ConcurrentCompute::initPrograms (SourceCollections& sourceCollections) const
3033 {
3034 std::ostringstream src;
3035 src << "#version 310 es\n"
3036 << "layout (local_size_x = 1, local_size_y = 1, local_size_z = 1) in;\n"
3037 << "layout(binding = 0) buffer InOut {\n"
3038 << " uint values[1024];\n"
3039 << "} sb_inout;\n"
3040 << "void main (void) {\n"
3041 << " uvec3 size = gl_NumWorkGroups * gl_WorkGroupSize;\n"
3042 << " uint numValuesPerInv = uint(sb_inout.values.length()) / (size.x*size.y*size.z);\n"
3043 << " uint groupNdx = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
3044 << " uint offset = numValuesPerInv*groupNdx;\n"
3045 << "\n"
3046 << " for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
3047 << " sb_inout.values[offset + ndx] = ~sb_inout.values[offset + ndx];\n"
3048 << "}\n";
3049
3050 sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
3051 }
3052
createInstance(Context & context) const3053 TestInstance* ConcurrentCompute::createInstance (Context& context) const
3054 {
3055 return new ConcurrentComputeInstance(context);
3056 }
3057
ConcurrentComputeInstance(Context & context)3058 ConcurrentComputeInstance::ConcurrentComputeInstance (Context& context)
3059 : TestInstance (context)
3060 {
3061 }
3062
// Creates a custom device exposing two compute-capable queues (high priority
// 1.0 and low priority 0.0), submits the same bit-inverting shader on both
// against separate buffers, and checks:
//  - both dispatches produce ~input for every element,
//  - the high-priority queue is (normally) not slower than the low-priority
//    one; an inversion is only logged as a note, not a failure.
tcu::TestStatus ConcurrentComputeInstance::iterate (void)
{
	enum {
		NO_MATCH_FOUND	= ~((deUint32)0),	// sentinel: no suitable queue family yet
		ERROR_NONE		= 0,
		ERROR_WAIT		= 1,				// waiting on the low-priority fence failed
		ERROR_ORDER		= 2					// high-priority queue finished after low-priority one
	};

	struct Queues
	{
		VkQueue		queue;
		deUint32	queueFamilyIndex;
	};

	// const DeviceInterface& vk = m_context.getDeviceInterface();
	const deUint32							numValues			= 1024;
	// A custom instance/device is required because the default context device
	// does not guarantee two compute queues with distinct priorities.
	const CustomInstance					instance			(createCustomInstanceFromContext(m_context));
	const InstanceDriver&					instanceDriver		(instance.getDriver());
	const VkPhysicalDevice					physicalDevice		= chooseDevice(instanceDriver, instance, m_context.getTestContext().getCommandLine());
	tcu::TestLog&							log					= m_context.getTestContext().getLog();
	vk::Move<vk::VkDevice>					logicalDevice;
	std::vector<VkQueueFamilyProperties>	queueFamilyProperties;
	VkDeviceCreateInfo						deviceInfo;
	VkPhysicalDeviceFeatures				deviceFeatures;
	const float								queuePriorities[2]	= {1.0f, 0.0f};	// [0] high, [1] low
	VkDeviceQueueCreateInfo					queueInfos[2];
	Queues									queues[2]			=
	{
		{DE_NULL, (deUint32)NO_MATCH_FOUND},
		{DE_NULL, (deUint32)NO_MATCH_FOUND}
	};

	queueFamilyProperties = getPhysicalDeviceQueueFamilyProperties(instanceDriver, physicalDevice);

	// Find two compute queues: either two queues in one family (queueCount > 1)
	// or one queue in each of two different compute-capable families.
	for (deUint32 queueNdx = 0; queueNdx < queueFamilyProperties.size(); ++queueNdx)
	{
		if (queueFamilyProperties[queueNdx].queueFlags & VK_QUEUE_COMPUTE_BIT)
		{
			if (NO_MATCH_FOUND == queues[0].queueFamilyIndex)
				queues[0].queueFamilyIndex = queueNdx;

			if (queues[0].queueFamilyIndex != queueNdx || queueFamilyProperties[queueNdx].queueCount > 1u)
			{
				queues[1].queueFamilyIndex = queueNdx;
				break;
			}
		}
	}

	if (queues[0].queueFamilyIndex == NO_MATCH_FOUND || queues[1].queueFamilyIndex == NO_MATCH_FOUND)
		TCU_THROW(NotSupportedError, "Queues couldn't be created");

	// One VkDeviceQueueCreateInfo per family; if both queues share a family a
	// single info with queueCount == 2 (and both priorities) is enough.
	for (int queueNdx = 0; queueNdx < 2; ++queueNdx)
	{
		VkDeviceQueueCreateInfo queueInfo;
		deMemset(&queueInfo, 0, sizeof(queueInfo));

		queueInfo.sType				= VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
		queueInfo.pNext				= DE_NULL;
		queueInfo.flags				= (VkDeviceQueueCreateFlags)0u;
		queueInfo.queueFamilyIndex	= queues[queueNdx].queueFamilyIndex;
		queueInfo.queueCount		= (queues[0].queueFamilyIndex == queues[1].queueFamilyIndex) ? 2 : 1;
		queueInfo.pQueuePriorities	= (queueInfo.queueCount == 2) ? queuePriorities : &queuePriorities[queueNdx];

		queueInfos[queueNdx] = queueInfo;

		if (queues[0].queueFamilyIndex == queues[1].queueFamilyIndex)
			break;
	}

	// Vulkan SC requires object-reservation and feature structures chained into
	// device creation; on regular Vulkan pNext stays null.
	void* pNext = DE_NULL;
#ifdef CTS_USES_VULKANSC
	VkDeviceObjectReservationCreateInfo	memReservationInfo	= m_context.getTestContext().getCommandLine().isSubProcess() ? m_context.getResourceInterface()->getStatMax() : resetDeviceObjectReservationCreateInfo();
	memReservationInfo.pNext								= pNext;
	pNext													= &memReservationInfo;

	VkPhysicalDeviceVulkanSC10Features	sc10Features		= createDefaultSC10Features();
	sc10Features.pNext										= pNext;
	pNext													= &sc10Features;

	VkPipelineCacheCreateInfo			pcCI;
	std::vector<VkPipelinePoolSize>		poolSizes;
	if (m_context.getTestContext().getCommandLine().isSubProcess())
	{
		if (m_context.getResourceInterface()->getCacheDataSize() > 0)
		{
			pcCI =
			{
				VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO,			// VkStructureType				sType;
				DE_NULL,												// const void*					pNext;
				VK_PIPELINE_CACHE_CREATE_READ_ONLY_BIT |
					VK_PIPELINE_CACHE_CREATE_USE_APPLICATION_STORAGE_BIT,	// VkPipelineCacheCreateFlags	flags;
				m_context.getResourceInterface()->getCacheDataSize(),	// deUintptr					initialDataSize;
				m_context.getResourceInterface()->getCacheData()		// const void*					pInitialData;
			};
			memReservationInfo.pipelineCacheCreateInfoCount	= 1;
			memReservationInfo.pPipelineCacheCreateInfos	= &pcCI;
		}

		poolSizes = m_context.getResourceInterface()->getPipelinePoolSizes();
		if (!poolSizes.empty())
		{
			memReservationInfo.pipelinePoolSizeCount	= deUint32(poolSizes.size());
			memReservationInfo.pPipelinePoolSizes		= poolSizes.data();
		}
	}
#endif // CTS_USES_VULKANSC

	deMemset(&deviceInfo, 0, sizeof(deviceInfo));
	instanceDriver.getPhysicalDeviceFeatures(physicalDevice, &deviceFeatures);

	deviceInfo.sType					= VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
	deviceInfo.pNext					= pNext;
	deviceInfo.enabledExtensionCount	= 0u;
	deviceInfo.ppEnabledExtensionNames	= DE_NULL;
	deviceInfo.enabledLayerCount		= 0u;
	deviceInfo.ppEnabledLayerNames		= DE_NULL;
	deviceInfo.pEnabledFeatures			= &deviceFeatures;
	deviceInfo.queueCreateInfoCount		= (queues[0].queueFamilyIndex == queues[1].queueFamilyIndex) ? 1 : 2;
	deviceInfo.pQueueCreateInfos		= queueInfos;

	logicalDevice = createCustomDevice (m_context.getTestContext().getCommandLine().isValidationEnabled(), m_context.getPlatformInterface(), instance, instanceDriver, physicalDevice, &deviceInfo);

#ifndef CTS_USES_VULKANSC
	de::MovePtr<vk::DeviceDriver>	deviceDriver = de::MovePtr<DeviceDriver>(new DeviceDriver(m_context.getPlatformInterface(), instance, *logicalDevice));
#else
	de::MovePtr<vk::DeviceDriverSC, vk::DeinitDeviceDeleter>	deviceDriver = de::MovePtr<DeviceDriverSC, DeinitDeviceDeleter>(new DeviceDriverSC(m_context.getPlatformInterface(), instance, *logicalDevice, m_context.getTestContext().getCommandLine(), m_context.getResourceInterface(), m_context.getDeviceVulkanSC10Properties(), m_context.getDeviceProperties()), vk::DeinitDeviceDeleter(m_context.getResourceInterface().get(), *logicalDevice));
#endif // CTS_USES_VULKANSC
	vk::DeviceInterface& vk = *deviceDriver;

	// Fetch the queue handles: same family -> queue indices 0 and 1;
	// different families -> queue index 0 in each family.
	for (deUint32 queueReqNdx = 0; queueReqNdx < 2; ++queueReqNdx)
	{
		if (queues[0].queueFamilyIndex == queues[1].queueFamilyIndex)
			vk.getDeviceQueue(*logicalDevice, queues[queueReqNdx].queueFamilyIndex, queueReqNdx, &queues[queueReqNdx].queue);
		else
			vk.getDeviceQueue(*logicalDevice, queues[queueReqNdx].queueFamilyIndex, 0u, &queues[queueReqNdx].queue);
	}

	// Create an input/output buffers
	const VkPhysicalDeviceMemoryProperties memoryProperties = vk::getPhysicalDeviceMemoryProperties(instanceDriver, physicalDevice);

	de::MovePtr<SimpleAllocator> allocator = de::MovePtr<SimpleAllocator>(new SimpleAllocator(vk, *logicalDevice, memoryProperties));
	const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * numValues;
	// One independent in/out buffer per queue so the dispatches do not interact.
	const BufferWithMemory buffer1(vk, *logicalDevice, *allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
	const BufferWithMemory buffer2(vk, *logicalDevice, *allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);

	// Fill the buffers with data

	typedef std::vector<deUint32> data_vector_t;
	data_vector_t inputData(numValues);	// host-side copy used for validation

	{
		de::Random rnd(0x82ce7f);
		const Allocation& bufferAllocation1	= buffer1.getAllocation();
		const Allocation& bufferAllocation2	= buffer2.getAllocation();
		deUint32* bufferPtr1				= static_cast<deUint32*>(bufferAllocation1.getHostPtr());
		deUint32* bufferPtr2				= static_cast<deUint32*>(bufferAllocation2.getHostPtr());

		// Both buffers get identical random contents.
		for (deUint32 i = 0; i < numValues; ++i)
		{
			deUint32 val = rnd.getUint32();
			inputData[i] = val;
			*bufferPtr1++ = val;
			*bufferPtr2++ = val;
		}

		flushAlloc(vk, *logicalDevice, bufferAllocation1);
		flushAlloc(vk, *logicalDevice, bufferAllocation2);
	}

	// Create descriptor sets (a separate layout/pool/set per queue)

	const Unique<VkDescriptorSetLayout>	descriptorSetLayout1(
		DescriptorSetLayoutBuilder()
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.build(vk, *logicalDevice));

	const Unique<VkDescriptorPool>		descriptorPool1(
		DescriptorPoolBuilder()
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
		.build(vk, *logicalDevice, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

	const Unique<VkDescriptorSet>		descriptorSet1(makeDescriptorSet(vk, *logicalDevice, *descriptorPool1, *descriptorSetLayout1));

	const VkDescriptorBufferInfo		bufferDescriptorInfo1 = makeDescriptorBufferInfo(*buffer1, 0ull, bufferSizeBytes);
	DescriptorSetUpdateBuilder()
		.writeSingle(*descriptorSet1, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo1)
		.update(vk, *logicalDevice);

	const Unique<VkDescriptorSetLayout>	descriptorSetLayout2(
		DescriptorSetLayoutBuilder()
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.build(vk, *logicalDevice));

	const Unique<VkDescriptorPool>		descriptorPool2(
		DescriptorPoolBuilder()
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
		.build(vk, *logicalDevice, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

	const Unique<VkDescriptorSet>		descriptorSet2(makeDescriptorSet(vk, *logicalDevice, *descriptorPool2, *descriptorSetLayout2));

	const VkDescriptorBufferInfo		bufferDescriptorInfo2 = makeDescriptorBufferInfo(*buffer2, 0ull, bufferSizeBytes);
	DescriptorSetUpdateBuilder()
		.writeSingle(*descriptorSet2, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo2)
		.update(vk, *logicalDevice);

	// Perform the computation

	// Both pipelines share the same shader module but have their own layout,
	// barriers, command pool and command buffer.
	const Unique<VkShaderModule> shaderModule(createShaderModule(vk, *logicalDevice, m_context.getBinaryCollection().get("comp"), 0u));

	const Unique<VkPipelineLayout>	pipelineLayout1(makePipelineLayout(vk, *logicalDevice, *descriptorSetLayout1));
	const Unique<VkPipeline>		pipeline1(makeComputePipeline(vk, *logicalDevice, *pipelineLayout1, *shaderModule));
	const VkBufferMemoryBarrier		hostWriteBarrier1 = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *buffer1, 0ull, bufferSizeBytes);
	const VkBufferMemoryBarrier		shaderWriteBarrier1 = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer1, 0ull, bufferSizeBytes);
	const Unique<VkCommandPool>		cmdPool1(makeCommandPool(vk, *logicalDevice, queues[0].queueFamilyIndex));
	const Unique<VkCommandBuffer>	cmdBuffer1(allocateCommandBuffer(vk, *logicalDevice, *cmdPool1, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

	const Unique<VkPipelineLayout>	pipelineLayout2(makePipelineLayout(vk, *logicalDevice, *descriptorSetLayout2));
	const Unique<VkPipeline>		pipeline2(makeComputePipeline(vk, *logicalDevice, *pipelineLayout2, *shaderModule));
	const VkBufferMemoryBarrier		hostWriteBarrier2 = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *buffer2, 0ull, bufferSizeBytes);
	const VkBufferMemoryBarrier		shaderWriteBarrier2 = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer2, 0ull, bufferSizeBytes);
	const Unique<VkCommandPool>		cmdPool2(makeCommandPool(vk, *logicalDevice, queues[1].queueFamilyIndex));
	const Unique<VkCommandBuffer>	cmdBuffer2(allocateCommandBuffer(vk, *logicalDevice, *cmdPool2, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

	// Command buffer 1

	beginCommandBuffer(vk, *cmdBuffer1);
	vk.cmdBindPipeline(*cmdBuffer1, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline1);
	vk.cmdBindDescriptorSets(*cmdBuffer1, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout1, 0u, 1u, &descriptorSet1.get(), 0u, DE_NULL);
	vk.cmdPipelineBarrier(*cmdBuffer1, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostWriteBarrier1, 0, (const VkImageMemoryBarrier*)DE_NULL);
	vk.cmdDispatch(*cmdBuffer1, 1, 1, 1);
	vk.cmdPipelineBarrier(*cmdBuffer1, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &shaderWriteBarrier1, 0, (const VkImageMemoryBarrier*)DE_NULL);
	endCommandBuffer(vk, *cmdBuffer1);

	// Command buffer 2

	beginCommandBuffer(vk, *cmdBuffer2);
	vk.cmdBindPipeline(*cmdBuffer2, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline2);
	vk.cmdBindDescriptorSets(*cmdBuffer2, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout2, 0u, 1u, &descriptorSet2.get(), 0u, DE_NULL);
	vk.cmdPipelineBarrier(*cmdBuffer2, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostWriteBarrier2, 0, (const VkImageMemoryBarrier*)DE_NULL);
	vk.cmdDispatch(*cmdBuffer2, 1, 1, 1);
	vk.cmdPipelineBarrier(*cmdBuffer2, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &shaderWriteBarrier2, 0, (const VkImageMemoryBarrier*)DE_NULL);
	endCommandBuffer(vk, *cmdBuffer2);

	VkSubmitInfo	submitInfo1 =
	{
		VK_STRUCTURE_TYPE_SUBMIT_INFO,			// sType
		DE_NULL,								// pNext
		0u,										// waitSemaphoreCount
		DE_NULL,								// pWaitSemaphores
		(const VkPipelineStageFlags*)DE_NULL,	// pWaitDstStageMask
		1u,										// commandBufferCount
		&cmdBuffer1.get(),						// pCommandBuffers
		0u,										// signalSemaphoreCount
		DE_NULL									// pSignalSemaphores
	};

	VkSubmitInfo	submitInfo2 =
	{
		VK_STRUCTURE_TYPE_SUBMIT_INFO,			// sType
		DE_NULL,								// pNext
		0u,										// waitSemaphoreCount
		DE_NULL,								// pWaitSemaphores
		(const VkPipelineStageFlags*)DE_NULL,	// pWaitDstStageMask
		1u,										// commandBufferCount
		&cmdBuffer2.get(),						// pCommandBuffers
		0u,										// signalSemaphoreCount
		DE_NULL									// pSignalSemaphores
	};

	// Wait for completion
	const Unique<VkFence> fence1(createFence(vk, *logicalDevice));
	const Unique<VkFence> fence2(createFence(vk, *logicalDevice));

	// High-priority submission first (queues[0]), then low-priority (queues[1]).
	VK_CHECK(vk.queueSubmit(queues[0].queue, 1u, &submitInfo1, *fence1));
	VK_CHECK(vk.queueSubmit(queues[1].queue, 1u, &submitInfo2, *fence2));

	int err = ERROR_NONE;

	// First wait for the low-priority queue
	if (VK_SUCCESS != vk.waitForFences(*logicalDevice, 1u, &fence2.get(), DE_TRUE, ~0ull))
		err = ERROR_WAIT;

	// If the high-priority queue hasn't finished, we have a problem.
	if (VK_SUCCESS != vk.getFenceStatus(*logicalDevice, fence1.get()))
		if (err == ERROR_NONE)
			err = ERROR_ORDER;

	// Wait for the high-priority fence so we don't get errors on teardown.
	// Return value is deliberately ignored: the important failure was
	// already recorded above.
	vk.waitForFences(*logicalDevice, 1u, &fence1.get(), DE_TRUE, ~0ull);

	// If we fail() before waiting for all of the fences, error will come from
	// teardown instead of the error we want.

	if (err == ERROR_WAIT)
	{
		return tcu::TestStatus::fail("Failed waiting for low-priority queue fence.");
	}

	// Validate the results

	const Allocation& bufferAllocation1 = buffer1.getAllocation();
	invalidateAlloc(vk, *logicalDevice, bufferAllocation1);
	const deUint32* bufferPtr1 = static_cast<deUint32*>(bufferAllocation1.getHostPtr());

	const Allocation& bufferAllocation2 = buffer2.getAllocation();
	invalidateAlloc(vk, *logicalDevice, bufferAllocation2);
	const deUint32* bufferPtr2 = static_cast<deUint32*>(bufferAllocation2.getHostPtr());

	// Each element must be the bitwise inverse of the input, on both queues.
	for (deUint32 ndx = 0; ndx < numValues; ++ndx)
	{
		const deUint32 res1	= bufferPtr1[ndx];
		const deUint32 res2	= bufferPtr2[ndx];
		const deUint32 inp	= inputData[ndx];
		const deUint32 ref	= ~inp;

		if (res1 != ref || res1 != res2)
		{
			std::ostringstream msg;
			msg << "Comparison failed for InOut.values[" << ndx << "] ref:" << ref <<" res1:" << res1 << " res2:" << res2 << " inp:" << inp;
			return tcu::TestStatus::fail(msg.str());
		}
	}

	// An inverted completion order is only informational, not a failure.
	if (err == ERROR_ORDER)
		log << tcu::TestLog::Message << "Note: Low-priority queue was faster than high-priority one. This is not an error, but priorities may be inverted." << tcu::TestLog::EndMessage;

	return tcu::TestStatus::pass("Test passed");
}
3393
// Test case: dispatches with at least one dimension equal to zero (an "empty"
// dispatch) followed by a 1x1x1 dispatch, verifying that the empty dispatch
// runs no invocations (the instance checks the atomic counter equals 1).
class EmptyWorkGroupCase : public vkt::TestCase
{
public:
					EmptyWorkGroupCase	(tcu::TestContext& testCtx, const std::string& name, const std::string& description, const tcu::UVec3& dispatchSize);
	virtual			~EmptyWorkGroupCase	(void) {}

	TestInstance*	createInstance		(Context& context) const override;
	void			initPrograms		(vk::SourceCollections& programCollection) const override;

protected:
	// Dispatch dimensions; the constructor asserts at least one component is zero.
	const tcu::UVec3 m_dispatchSize;
};
3406
// Instance side of EmptyWorkGroupCase; iterate() performs the empty and the
// non-empty dispatch and validates the counter buffer.
class EmptyWorkGroupInstance : public vkt::TestInstance
{
public:
						EmptyWorkGroupInstance	(Context& context, const tcu::UVec3& dispatchSize)
							: vkt::TestInstance	(context)
							, m_dispatchSize	(dispatchSize)
							{}
	virtual				~EmptyWorkGroupInstance	(void) {}

	tcu::TestStatus		iterate					(void) override;

protected:
	// Dimensions of the (intentionally empty) first dispatch.
	const tcu::UVec3	m_dispatchSize;
};
3421
EmptyWorkGroupCase(tcu::TestContext & testCtx,const std::string & name,const std::string & description,const tcu::UVec3 & dispatchSize)3422 EmptyWorkGroupCase::EmptyWorkGroupCase (tcu::TestContext& testCtx, const std::string& name, const std::string& description, const tcu::UVec3& dispatchSize)
3423 : vkt::TestCase (testCtx, name, description)
3424 , m_dispatchSize (dispatchSize)
3425 {
3426 DE_ASSERT(m_dispatchSize.x() == 0u || m_dispatchSize.y() == 0u || m_dispatchSize.z() == 0u);
3427 }
3428
createInstance(Context & context) const3429 TestInstance* EmptyWorkGroupCase::createInstance (Context& context) const
3430 {
3431 return new EmptyWorkGroupInstance(context, m_dispatchSize);
3432 }
3433
initPrograms(vk::SourceCollections & programCollection) const3434 void EmptyWorkGroupCase::initPrograms (vk::SourceCollections& programCollection) const
3435 {
3436 std::ostringstream comp;
3437 comp
3438 << "#version 450\n"
3439 << "layout (local_size_x=1, local_size_y=1, local_size_z=1) in;\n"
3440 << "layout (set=0, binding=0) buffer VerificationBlock { uint value; } verif;\n"
3441 << "void main () { atomicAdd(verif.value, 1u); }\n"
3442 ;
3443 programCollection.glslSources.add("comp") << glu::ComputeSource(comp.str());
3444 }
3445
// Dispatches once with m_dispatchSize (which has a zero dimension, so no
// invocations should run) and once with 1x1x1, then checks the atomic counter
// is exactly 1 — proving the empty dispatch executed nothing.
tcu::TestStatus EmptyWorkGroupInstance::iterate (void)
{
	const auto&	vkd			= m_context.getDeviceInterface();
	const auto	device		= m_context.getDevice();
	auto&		alloc		= m_context.getDefaultAllocator();
	const auto	queueIndex	= m_context.getUniversalQueueFamilyIndex();
	const auto	queue		= m_context.getUniversalQueue();

	// Single-uint counter buffer, zero-initialized from the host.
	const auto	verifBufferSize		= static_cast<VkDeviceSize>(sizeof(uint32_t));
	const auto	verifBufferInfo		= makeBufferCreateInfo(verifBufferSize, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT);
	BufferWithMemory	verifBuffer	(vkd, device, alloc, verifBufferInfo, MemoryRequirement::HostVisible);
	auto&		verifBufferAlloc	= verifBuffer.getAllocation();
	void*		verifBufferPtr		= verifBufferAlloc.getHostPtr();

	deMemset(verifBufferPtr, 0, static_cast<size_t>(verifBufferSize));
	flushAlloc(vkd, device, verifBufferAlloc);

	// Pipeline with a single storage-buffer binding for the counter.
	DescriptorSetLayoutBuilder layoutBuilder;
	layoutBuilder.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT);
	const auto descriptorSetLayout = layoutBuilder.build(vkd, device);

	const auto pipelineLayout	= makePipelineLayout(vkd, device, descriptorSetLayout.get());
	const auto shaderModule		= createShaderModule(vkd, device, m_context.getBinaryCollection().get("comp"), 0u);
	const auto pipeline			= makeComputePipeline(vkd, device, pipelineLayout.get(), shaderModule.get());

	DescriptorPoolBuilder poolBuilder;
	poolBuilder.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
	const auto descriptorPool	= poolBuilder.build(vkd, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u);
	const auto descriptorSet	= makeDescriptorSet(vkd, device, descriptorPool.get(), descriptorSetLayout.get());

	DescriptorSetUpdateBuilder updateBuilder;
	const auto verifBufferDescInfo = makeDescriptorBufferInfo(verifBuffer.get(), 0ull, verifBufferSize);
	updateBuilder.writeSingle(descriptorSet.get(), DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &verifBufferDescInfo);
	updateBuilder.update(vkd, device);

	const auto cmdPool		= makeCommandPool(vkd, device, queueIndex);
	const auto cmdBufferPtr	= allocateCommandBuffer(vkd, device, cmdPool.get(), VK_COMMAND_BUFFER_LEVEL_PRIMARY);
	const auto cmdBuffer	= cmdBufferPtr.get();

	beginCommandBuffer(vkd, cmdBuffer);
	vkd.cmdBindPipeline(cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline.get());
	vkd.cmdBindDescriptorSets(cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipelineLayout.get(), 0u, 1u, &descriptorSet.get(), 0u, nullptr);
	// The empty dispatch: at least one of these dimensions is zero.
	vkd.cmdDispatch(cmdBuffer, m_dispatchSize.x(), m_dispatchSize.y(), m_dispatchSize.z());

	// Order the (potentially zero) first dispatch before the second one.
	const auto readWriteAccess	= (VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_SHADER_READ_BIT);
	const auto computeToCompute	= makeMemoryBarrier(readWriteAccess, readWriteAccess);
	vkd.cmdPipelineBarrier(cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0U, 1u, &computeToCompute, 0u, nullptr, 0u, nullptr);

	// The control dispatch: exactly one invocation, so counter should become 1.
	vkd.cmdDispatch(cmdBuffer, 1u, 1u, 1u);

	const auto computeToHost = makeMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT);
	vkd.cmdPipelineBarrier(cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, 0u, 1u, &computeToHost, 0u, nullptr, 0u, nullptr);

	endCommandBuffer(vkd, cmdBuffer);
	submitCommandsAndWait(vkd, device, queue, cmdBuffer);

	uint32_t value;
	invalidateAlloc(vkd, device, verifBufferAlloc);
	deMemcpy(&value, verifBufferPtr, sizeof(value));

	// Anything other than 1 means the empty dispatch ran invocations
	// (or the control dispatch did not run).
	if (value != 1u)
	{
		std::ostringstream msg;
		msg << "Unexpected value found in buffer: " << value << " while expecting 1";
		TCU_FAIL(msg.str());
	}

	return tcu::TestStatus::pass("Pass");
}
3515
// Test case checking that a compute dispatch works when using the maximum
// reported workgroup size on one axis (the other two axes are kept at 1).
// The actual size is plugged in at pipeline creation time via specialization
// constants, because the limit is only known at runtime.
class MaxWorkGroupSizeTest : public vkt::TestCase
{
public:
	// Axis whose workgroup size limit will be exercised. The numeric values
	// match the indices of VkPhysicalDeviceLimits::maxComputeWorkGroupSize.
	enum class Axis { X = 0, Y = 1, Z = 2 };

	struct Params
	{
		// Which axis to maximize.
		Axis axis;
	};

	MaxWorkGroupSizeTest (tcu::TestContext& testCtx, const std::string& name, const std::string& description, const Params& params);
	virtual ~MaxWorkGroupSizeTest (void) {}

	virtual void initPrograms (vk::SourceCollections& programCollection) const;
	virtual TestInstance* createInstance (Context& context) const;
	virtual void checkSupport (Context& context) const;

	// Helper to transform the axis value to an index.
	static int getIndex (Axis axis);

	// Helper returning the number of invocations according to the test parameters.
	// If devProperties is non-null it is used directly; otherwise the properties
	// are queried from the physical device.
	static deUint32 getInvocations (const Params& params, const vk::InstanceInterface& vki, vk::VkPhysicalDevice physicalDevice, const vk::VkPhysicalDeviceProperties* devProperties = nullptr);

	// Helper returning the buffer size needed to this test (one uint per invocation).
	static deUint32 getSSBOSize (deUint32 invocations);

private:
	Params m_params;
};
3546
// Instance that performs the actual dispatch with the maximum workgroup size
// for the configured axis and verifies the output buffer.
class MaxWorkGroupSizeInstance : public vkt::TestInstance
{
public:
	MaxWorkGroupSizeInstance (Context& context, const MaxWorkGroupSizeTest::Params& params);
	virtual ~MaxWorkGroupSizeInstance (void) {}

	virtual tcu::TestStatus iterate (void);

private:
	MaxWorkGroupSizeTest::Params m_params;
};
3558
getIndex(Axis axis)3559 int MaxWorkGroupSizeTest::getIndex (Axis axis)
3560 {
3561 const int ret = static_cast<int>(axis);
3562 DE_ASSERT(ret >= static_cast<int>(Axis::X) && ret <= static_cast<int>(Axis::Z));
3563 return ret;
3564 }
3565
getInvocations(const Params & params,const vk::InstanceInterface & vki,vk::VkPhysicalDevice physicalDevice,const vk::VkPhysicalDeviceProperties * devProperties)3566 deUint32 MaxWorkGroupSizeTest::getInvocations (const Params& params, const vk::InstanceInterface& vki, vk::VkPhysicalDevice physicalDevice, const vk::VkPhysicalDeviceProperties* devProperties)
3567 {
3568 const auto axis = getIndex(params.axis);
3569
3570 if (devProperties)
3571 return devProperties->limits.maxComputeWorkGroupSize[axis];
3572 return vk::getPhysicalDeviceProperties(vki, physicalDevice).limits.maxComputeWorkGroupSize[axis];
3573 }
3574
getSSBOSize(deUint32 invocations)3575 deUint32 MaxWorkGroupSizeTest::getSSBOSize (deUint32 invocations)
3576 {
3577 return invocations * static_cast<deUint32>(sizeof(deUint32));
3578 }
3579
// Stores the test parameters; no other state is needed at case level.
MaxWorkGroupSizeTest::MaxWorkGroupSizeTest (tcu::TestContext& testCtx, const std::string& name, const std::string& description, const Params& params)
	: vkt::TestCase (testCtx, name, description)
	, m_params (params)
{}
3584
// Builds the compute shader. The local size is not hard-coded: each axis is a
// specialization constant (ids 0..2), so the instance can set the maximum size
// for the tested axis when creating the pipeline. Each invocation writes 1 to
// its own slot, indexed by gl_LocalInvocationIndex.
void MaxWorkGroupSizeTest::initPrograms (vk::SourceCollections& programCollection) const
{
	std::ostringstream shader;

	// The actual local sizes will be set using spec constants when running the test instance.
	shader
		<< "#version 450\n"
		<< "\n"
		<< "layout(constant_id=0) const int local_size_x_val = 1;\n"
		<< "layout(constant_id=1) const int local_size_y_val = 1;\n"
		<< "layout(constant_id=2) const int local_size_z_val = 1;\n"
		<< "\n"
		<< "layout(local_size_x_id=0, local_size_y_id=1, local_size_z_id=2) in;\n"
		<< "\n"
		<< "layout(set=0, binding=0) buffer StorageBuffer {\n"
		<< "    uint values[];\n"
		<< "} ssbo;\n"
		<< "\n"
		<< "void main() {\n"
		<< "    ssbo.values[gl_LocalInvocationIndex] = 1u;\n"
		<< "}\n"
		;

	programCollection.glslSources.add("comp") << glu::ComputeSource(shader.str());
}
3610
createInstance(Context & context) const3611 TestInstance* MaxWorkGroupSizeTest::createInstance (Context& context) const
3612 {
3613 return new MaxWorkGroupSizeInstance(context, m_params);
3614 }
3615
checkSupport(Context & context) const3616 void MaxWorkGroupSizeTest::checkSupport (Context& context) const
3617 {
3618 const auto& vki = context.getInstanceInterface();
3619 const auto physicalDevice = context.getPhysicalDevice();
3620
3621 const auto properties = vk::getPhysicalDeviceProperties(vki, physicalDevice);
3622 const auto invocations = getInvocations(m_params, vki, physicalDevice, &properties);
3623
3624 if (invocations > properties.limits.maxComputeWorkGroupInvocations)
3625 TCU_FAIL("Reported workgroup size limit in the axis is greater than the global invocation limit");
3626
3627 if (properties.limits.maxStorageBufferRange / static_cast<deUint32>(sizeof(deUint32)) < invocations)
3628 TCU_THROW(NotSupportedError, "Maximum supported storage buffer range too small");
3629 }
3630
// Keeps a copy of the parameters for use in iterate().
MaxWorkGroupSizeInstance::MaxWorkGroupSizeInstance (Context& context, const MaxWorkGroupSizeTest::Params& params)
	: vkt::TestInstance (context)
	, m_params (params)
{}
3635
// Runs a single 1x1x1 dispatch whose workgroup size on the tested axis is the
// device's reported maximum (injected via specialization constants), then
// reads the SSBO back and checks every invocation wrote its own slot.
tcu::TestStatus MaxWorkGroupSizeInstance::iterate (void)
{
	const auto& vki = m_context.getInstanceInterface();
	const auto& vkd = m_context.getDeviceInterface();
	const auto physicalDevice = m_context.getPhysicalDevice();
	const auto device = m_context.getDevice();
	auto& alloc = m_context.getDefaultAllocator();
	const auto queueIndex = m_context.getUniversalQueueFamilyIndex();
	const auto queue = m_context.getUniversalQueue();
	auto& log = m_context.getTestContext().getLog();

	const auto axis = MaxWorkGroupSizeTest::getIndex(m_params.axis);
	const auto invocations = MaxWorkGroupSizeTest::getInvocations(m_params, vki, physicalDevice);
	const auto ssboSize = static_cast<vk::VkDeviceSize>(MaxWorkGroupSizeTest::getSSBOSize(invocations));

	log
		<< tcu::TestLog::Message
		<< "Running test with " << invocations << " invocations on axis " << axis << " using a storage buffer size of " << ssboSize << " bytes"
		<< tcu::TestLog::EndMessage
		;

	// Main SSBO buffer: host-visible so the results can be read back directly.
	const auto ssboInfo = vk::makeBufferCreateInfo(ssboSize, vk::VK_BUFFER_USAGE_STORAGE_BUFFER_BIT);
	vk::BufferWithMemory ssbo (vkd, device, alloc, ssboInfo, vk::MemoryRequirement::HostVisible);

	// Shader module.
	const auto shaderModule = vk::createShaderModule(vkd, device, m_context.getBinaryCollection().get("comp"), 0u);

	// Descriptor set layouts.
	vk::DescriptorSetLayoutBuilder layoutBuilder;
	layoutBuilder.addSingleBinding(vk::VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, vk::VK_SHADER_STAGE_COMPUTE_BIT);
	const auto descriptorSetLayout = layoutBuilder.build(vkd, device);

	// Specialization constants: set the number of invocations in the appropriate local size id.
	// The other two axes keep the default local size of 1.
	const auto entrySize = static_cast<deUintptr>(sizeof(deInt32));
	deInt32 specializationData[3] = { 1, 1, 1 };
	specializationData[axis] = static_cast<deInt32>(invocations);

	// One map entry per local size spec constant (ids 0..2), laid out consecutively.
	const vk::VkSpecializationMapEntry specializationMaps[3] =
	{
		{
			0u,										// deUint32	constantID;
			0u,										// deUint32	offset;
			entrySize,								// deUintptr	size;
		},
		{
			1u,										// deUint32	constantID;
			static_cast<deUint32>(entrySize),		// deUint32	offset;
			entrySize,								// deUintptr	size;
		},
		{
			2u,										// deUint32	constantID;
			static_cast<deUint32>(entrySize * 2u),	// deUint32	offset;
			entrySize,								// deUintptr	size;
		},
	};

	const vk::VkSpecializationInfo specializationInfo =
	{
		3u,													// deUint32							mapEntryCount;
		specializationMaps,									// const VkSpecializationMapEntry*	pMapEntries;
		static_cast<deUintptr>(sizeof(specializationData)),	// deUintptr						dataSize;
		specializationData,									// const void*						pData;
	};

	// Test pipeline.
	const vk::VkPipelineLayoutCreateInfo testPipelineLayoutInfo =
	{
		vk::VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,	// VkStructureType				sType;
		nullptr,											// const void*					pNext;
		0u,													// VkPipelineLayoutCreateFlags	flags;
		1u,													// deUint32						setLayoutCount;
		&descriptorSetLayout.get(),							// const VkDescriptorSetLayout*	pSetLayouts;
		0u,													// deUint32						pushConstantRangeCount;
		nullptr,											// const VkPushConstantRange*	pPushConstantRanges;
	};
	const auto testPipelineLayout = vk::createPipelineLayout(vkd, device, &testPipelineLayoutInfo);

	const vk::VkComputePipelineCreateInfo testPipelineInfo =
	{
		vk::VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,	// VkStructureType				sType;
		nullptr,											// const void*					pNext;
		0u,													// VkPipelineCreateFlags		flags;
		{													// VkPipelineShaderStageCreateInfo	stage;
			vk::VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,	// VkStructureType					sType;
			nullptr,											// const void*						pNext;
			0u,													// VkPipelineShaderStageCreateFlags	flags;
			vk::VK_SHADER_STAGE_COMPUTE_BIT,					// VkShaderStageFlagBits			stage;
			shaderModule.get(),									// VkShaderModule					module;
			"main",												// const char*						pName;
			&specializationInfo,								// const VkSpecializationInfo*		pSpecializationInfo;
		},
		testPipelineLayout.get(),							// VkPipelineLayout				layout;
		DE_NULL,											// VkPipeline					basePipelineHandle;
		0u,													// deInt32						basePipelineIndex;
	};
	const auto testPipeline = vk::createComputePipeline(vkd, device, DE_NULL, &testPipelineInfo);

	// Create descriptor pool and set.
	vk::DescriptorPoolBuilder poolBuilder;
	poolBuilder.addType(vk::VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
	const auto descriptorPool = poolBuilder.build(vkd, device, vk::VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u);
	const auto descriptorSet = vk::makeDescriptorSet(vkd, device, descriptorPool.get(), descriptorSetLayout.get());

	// Update descriptor set.
	const vk::VkDescriptorBufferInfo ssboBufferInfo =
	{
		ssbo.get(),		// VkBuffer		buffer;
		0u,				// VkDeviceSize	offset;
		VK_WHOLE_SIZE,	// VkDeviceSize	range;
	};

	vk::DescriptorSetUpdateBuilder updateBuilder;
	updateBuilder.writeSingle(descriptorSet.get(), vk::DescriptorSetUpdateBuilder::Location::binding(0u), vk::VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &ssboBufferInfo);
	updateBuilder.update(vkd, device);

	// Clear buffer so any invocation that fails to write is detected as 0.
	auto& ssboAlloc = ssbo.getAllocation();
	void* ssboPtr = ssboAlloc.getHostPtr();
	deMemset(ssboPtr, 0, static_cast<size_t>(ssboSize));
	vk::flushAlloc(vkd, device, ssboAlloc);

	// Run pipelines.
	const auto cmdPool = vk::makeCommandPool(vkd, device, queueIndex);
	const auto cmdBUfferPtr = vk::allocateCommandBuffer(vkd, device, cmdPool.get(), vk::VK_COMMAND_BUFFER_LEVEL_PRIMARY);
	const auto cmdBuffer = cmdBUfferPtr.get();

	vk::beginCommandBuffer(vkd, cmdBuffer);

	// Make the host-side clear visible to the shader before running it.
	const auto hostToComputeBarrier = vk::makeBufferMemoryBarrier(vk::VK_ACCESS_HOST_WRITE_BIT, vk::VK_ACCESS_SHADER_WRITE_BIT, ssbo.get(), 0ull, VK_WHOLE_SIZE);
	vkd.cmdPipelineBarrier(cmdBuffer, vk::VK_PIPELINE_STAGE_HOST_BIT, vk::VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0u, 0u, nullptr, 1u, &hostToComputeBarrier, 0u, nullptr);

	// Single workgroup: the workgroup itself carries the maximum local size.
	vkd.cmdBindPipeline(cmdBuffer, vk::VK_PIPELINE_BIND_POINT_COMPUTE, testPipeline.get());
	vkd.cmdBindDescriptorSets(cmdBuffer, vk::VK_PIPELINE_BIND_POINT_COMPUTE, testPipelineLayout.get(), 0u, 1u, &descriptorSet.get(), 0u, nullptr);
	vkd.cmdDispatch(cmdBuffer, 1u, 1u, 1u);

	// Make shader writes visible to the host read-back below.
	const auto computeToHostBarrier = vk::makeBufferMemoryBarrier(vk::VK_ACCESS_SHADER_WRITE_BIT, vk::VK_ACCESS_HOST_READ_BIT, ssbo.get(), 0ull, VK_WHOLE_SIZE);
	vkd.cmdPipelineBarrier(cmdBuffer, vk::VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, vk::VK_PIPELINE_STAGE_HOST_BIT, 0u, 0u, nullptr, 1u, &computeToHostBarrier, 0u, nullptr);

	vk::endCommandBuffer(vkd, cmdBuffer);
	vk::submitCommandsAndWait(vkd, device, queue, cmdBuffer);

	// Verify buffer contents: every slot must contain the value 1.
	vk::invalidateAlloc(vkd, device, ssboAlloc);
	std::unique_ptr<deUint32[]> valuesArray (new deUint32[invocations]);
	deUint32* valuesPtr = valuesArray.get();
	deMemcpy(valuesPtr, ssboPtr, static_cast<size_t>(ssboSize));

	std::string errorMsg;
	bool ok = true;

	for (size_t i = 0; i < invocations; ++i)
	{
		if (valuesPtr[i] != 1u)
		{
			ok = false;
			errorMsg = "Found invalid value for invocation index " + de::toString(i) + ": expected 1u and found " + de::toString(valuesPtr[i]);
			break;
		}
	}

	if (!ok)
		return tcu::TestStatus::fail(errorMsg);
	return tcu::TestStatus::pass("Pass");
}
3802
3803 namespace EmptyShaderTest
3804 {
3805
createProgram(SourceCollections & dst)3806 void createProgram (SourceCollections& dst)
3807 {
3808 dst.glslSources.add("comp") << glu::ComputeSource(
3809 "#version 310 es\n"
3810 "layout (local_size_x = 1) in;\n"
3811 "void main (void) {}\n"
3812 );
3813 }
3814
createTest(Context & context)3815 tcu::TestStatus createTest (Context& context)
3816 {
3817 const DeviceInterface& vk = context.getDeviceInterface();
3818 const VkDevice device = context.getDevice();
3819 const VkQueue queue = context.getUniversalQueue();
3820 const deUint32 queueFamilyIndex = context.getUniversalQueueFamilyIndex();
3821
3822 const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, context.getBinaryCollection().get("comp"), 0u));
3823
3824 const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device));
3825 const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));
3826
3827 const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
3828 const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
3829
3830 // Start recording commands
3831
3832 beginCommandBuffer(vk, *cmdBuffer);
3833
3834 vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
3835
3836 const tcu::IVec3 workGroups(1, 1, 1);
3837 vk.cmdDispatch(*cmdBuffer, workGroups.x(), workGroups.y(), workGroups.z());
3838
3839 endCommandBuffer(vk, *cmdBuffer);
3840
3841 submitCommandsAndWait(vk, device, queue, *cmdBuffer);
3842
3843 return tcu::TestStatus::pass("Compute succeeded");
3844 }
3845
3846 } // EmptyShaderTest ns
3847 } // anonymous
3848
// Assembles the "basic" compute test group: trivial dispatches, workgroup size
// edge cases, buffer/image copies, barriers, shared variables and atomics.
tcu::TestCaseGroup* createBasicComputeShaderTests (tcu::TestContext& testCtx)
{
	de::MovePtr<tcu::TestCaseGroup> basicComputeTests(new tcu::TestCaseGroup(testCtx, "basic", "Basic compute tests"));

	// Smoke tests: empty shader and concurrent dispatch.
	addFunctionCaseWithPrograms(basicComputeTests.get(), "empty_shader", "Shader that does nothing", EmptyShaderTest::createProgram, EmptyShaderTest::createTest);

	basicComputeTests->addChild(new ConcurrentCompute(testCtx, "concurrent_compute", "Concurrent compute test"));

	// Workgroup size edge cases: zero-sized and maximum-sized axes.
	basicComputeTests->addChild(new EmptyWorkGroupCase(testCtx, "empty_workgroup_x", "Use an empty workgroup with size 0 on the X axis", tcu::UVec3(0u, 2u, 3u)));
	basicComputeTests->addChild(new EmptyWorkGroupCase(testCtx, "empty_workgroup_y", "Use an empty workgroup with size 0 on the Y axis", tcu::UVec3(2u, 0u, 3u)));
	basicComputeTests->addChild(new EmptyWorkGroupCase(testCtx, "empty_workgroup_z", "Use an empty workgroup with size 0 on the Z axis", tcu::UVec3(2u, 3u, 0u)));
	basicComputeTests->addChild(new EmptyWorkGroupCase(testCtx, "empty_workgroup_all", "Use an empty workgroup with size 0 on the X, Y and Z axes", tcu::UVec3(0u, 0u, 0u)));

	basicComputeTests->addChild(new MaxWorkGroupSizeTest(testCtx, "max_local_size_x", "Use the maximum work group size on the X axis", MaxWorkGroupSizeTest::Params{MaxWorkGroupSizeTest::Axis::X}));
	basicComputeTests->addChild(new MaxWorkGroupSizeTest(testCtx, "max_local_size_y", "Use the maximum work group size on the Y axis", MaxWorkGroupSizeTest::Params{MaxWorkGroupSizeTest::Axis::Y}));
	basicComputeTests->addChild(new MaxWorkGroupSizeTest(testCtx, "max_local_size_z", "Use the maximum work group size on the Z axis", MaxWorkGroupSizeTest::Params{MaxWorkGroupSizeTest::Axis::Z}));

	// Buffer-to-buffer copies with varying workgroup/local size combinations.
	basicComputeTests->addChild(BufferToBufferInvertTest::UBOToSSBOInvertCase(testCtx, "ubo_to_ssbo_single_invocation", "Copy from UBO to SSBO, inverting bits", 256, tcu::IVec3(1,1,1), tcu::IVec3(1,1,1)));
	basicComputeTests->addChild(BufferToBufferInvertTest::UBOToSSBOInvertCase(testCtx, "ubo_to_ssbo_single_group", "Copy from UBO to SSBO, inverting bits", 1024, tcu::IVec3(2,1,4), tcu::IVec3(1,1,1)));
	basicComputeTests->addChild(BufferToBufferInvertTest::UBOToSSBOInvertCase(testCtx, "ubo_to_ssbo_multiple_invocations", "Copy from UBO to SSBO, inverting bits", 1024, tcu::IVec3(1,1,1), tcu::IVec3(2,4,1)));
	basicComputeTests->addChild(BufferToBufferInvertTest::UBOToSSBOInvertCase(testCtx, "ubo_to_ssbo_multiple_groups", "Copy from UBO to SSBO, inverting bits", 1024, tcu::IVec3(1,4,2), tcu::IVec3(2,2,4)));

	basicComputeTests->addChild(BufferToBufferInvertTest::CopyInvertSSBOCase(testCtx, "copy_ssbo_single_invocation", "Copy between SSBOs, inverting bits", 256, tcu::IVec3(1,1,1), tcu::IVec3(1,1,1)));
	basicComputeTests->addChild(BufferToBufferInvertTest::CopyInvertSSBOCase(testCtx, "copy_ssbo_multiple_invocations", "Copy between SSBOs, inverting bits", 1024, tcu::IVec3(1,1,1), tcu::IVec3(2,4,1)));
	basicComputeTests->addChild(BufferToBufferInvertTest::CopyInvertSSBOCase(testCtx, "copy_ssbo_multiple_groups", "Copy between SSBOs, inverting bits", 1024, tcu::IVec3(1,4,2), tcu::IVec3(2,2,4)));

	// Same-buffer read/write, sized and unsized arrays.
	basicComputeTests->addChild(new InvertSSBOInPlaceTest(testCtx, "ssbo_rw_single_invocation", "Read and write same SSBO", 256, true, tcu::IVec3(1,1,1), tcu::IVec3(1,1,1)));
	basicComputeTests->addChild(new InvertSSBOInPlaceTest(testCtx, "ssbo_rw_multiple_groups", "Read and write same SSBO", 1024, true, tcu::IVec3(1,4,2), tcu::IVec3(2,2,4)));
	basicComputeTests->addChild(new InvertSSBOInPlaceTest(testCtx, "ssbo_unsized_arr_single_invocation", "Read and write same SSBO", 256, false, tcu::IVec3(1,1,1), tcu::IVec3(1,1,1)));
	basicComputeTests->addChild(new InvertSSBOInPlaceTest(testCtx, "ssbo_unsized_arr_multiple_groups", "Read and write same SSBO", 1024, false, tcu::IVec3(1,4,2), tcu::IVec3(2,2,4)));

	basicComputeTests->addChild(new WriteToMultipleSSBOTest(testCtx, "write_multiple_arr_single_invocation", "Write to multiple SSBOs", 256, true, tcu::IVec3(1,1,1), tcu::IVec3(1,1,1)));
	basicComputeTests->addChild(new WriteToMultipleSSBOTest(testCtx, "write_multiple_arr_multiple_groups", "Write to multiple SSBOs", 1024, true, tcu::IVec3(1,4,2), tcu::IVec3(2,2,4)));
	basicComputeTests->addChild(new WriteToMultipleSSBOTest(testCtx, "write_multiple_unsized_arr_single_invocation", "Write to multiple SSBOs", 256, false, tcu::IVec3(1,1,1), tcu::IVec3(1,1,1)));
	basicComputeTests->addChild(new WriteToMultipleSSBOTest(testCtx, "write_multiple_unsized_arr_multiple_groups", "Write to multiple SSBOs", 1024, false, tcu::IVec3(1,4,2), tcu::IVec3(2,2,4)));

	// Barrier tests: local (workgroup) barriers and command-level memory barriers.
	basicComputeTests->addChild(new SSBOLocalBarrierTest(testCtx, "ssbo_local_barrier_single_invocation", "SSBO local barrier usage", tcu::IVec3(1,1,1), tcu::IVec3(1,1,1)));
	basicComputeTests->addChild(new SSBOLocalBarrierTest(testCtx, "ssbo_local_barrier_single_group", "SSBO local barrier usage", tcu::IVec3(3,2,5), tcu::IVec3(1,1,1)));
	basicComputeTests->addChild(new SSBOLocalBarrierTest(testCtx, "ssbo_local_barrier_multiple_groups", "SSBO local barrier usage", tcu::IVec3(3,4,1), tcu::IVec3(2,7,3)));

	basicComputeTests->addChild(new SSBOBarrierTest(testCtx, "ssbo_cmd_barrier_single", "SSBO memory barrier usage", tcu::IVec3(1,1,1)));
	basicComputeTests->addChild(new SSBOBarrierTest(testCtx, "ssbo_cmd_barrier_multiple", "SSBO memory barrier usage", tcu::IVec3(11,5,7)));

	// Shared variables and atomics on shared memory.
	basicComputeTests->addChild(new SharedVarTest(testCtx, "shared_var_single_invocation", "Basic shared variable usage", tcu::IVec3(1,1,1), tcu::IVec3(1,1,1)));
	basicComputeTests->addChild(new SharedVarTest(testCtx, "shared_var_single_group", "Basic shared variable usage", tcu::IVec3(3,2,5), tcu::IVec3(1,1,1)));
	basicComputeTests->addChild(new SharedVarTest(testCtx, "shared_var_multiple_invocations", "Basic shared variable usage", tcu::IVec3(1,1,1), tcu::IVec3(2,5,4)));
	basicComputeTests->addChild(new SharedVarTest(testCtx, "shared_var_multiple_groups", "Basic shared variable usage", tcu::IVec3(3,4,1), tcu::IVec3(2,7,3)));

	basicComputeTests->addChild(new SharedVarAtomicOpTest(testCtx, "shared_atomic_op_single_invocation", "Atomic operation with shared var", tcu::IVec3(1,1,1), tcu::IVec3(1,1,1)));
	basicComputeTests->addChild(new SharedVarAtomicOpTest(testCtx, "shared_atomic_op_single_group", "Atomic operation with shared var", tcu::IVec3(3,2,5), tcu::IVec3(1,1,1)));
	basicComputeTests->addChild(new SharedVarAtomicOpTest(testCtx, "shared_atomic_op_multiple_invocations", "Atomic operation with shared var", tcu::IVec3(1,1,1), tcu::IVec3(2,5,4)));
	basicComputeTests->addChild(new SharedVarAtomicOpTest(testCtx, "shared_atomic_op_multiple_groups", "Atomic operation with shared var", tcu::IVec3(3,4,1), tcu::IVec3(2,7,3)));

	// Image <-> SSBO copies, image atomics and image barriers.
	basicComputeTests->addChild(new CopyImageToSSBOTest(testCtx, "copy_image_to_ssbo_small", "Image to SSBO copy", tcu::IVec2(1,1), tcu::IVec2(64,64)));
	basicComputeTests->addChild(new CopyImageToSSBOTest(testCtx, "copy_image_to_ssbo_large", "Image to SSBO copy", tcu::IVec2(2,4), tcu::IVec2(512,512)));

	basicComputeTests->addChild(new CopySSBOToImageTest(testCtx, "copy_ssbo_to_image_small", "SSBO to image copy", tcu::IVec2(1, 1), tcu::IVec2(64, 64)));
	basicComputeTests->addChild(new CopySSBOToImageTest(testCtx, "copy_ssbo_to_image_large", "SSBO to image copy", tcu::IVec2(2, 4), tcu::IVec2(512, 512)));

	basicComputeTests->addChild(new ImageAtomicOpTest(testCtx, "image_atomic_op_local_size_1", "Atomic operation with image", 1, tcu::IVec2(64,64)));
	basicComputeTests->addChild(new ImageAtomicOpTest(testCtx, "image_atomic_op_local_size_8", "Atomic operation with image", 8, tcu::IVec2(64,64)));

	basicComputeTests->addChild(new ImageBarrierTest(testCtx, "image_barrier_single", "Image barrier", tcu::IVec2(1,1)));
	basicComputeTests->addChild(new ImageBarrierTest(testCtx, "image_barrier_multiple", "Image barrier", tcu::IVec2(64,64)));

	// Amber-based cases are not available in the Vulkan SC build.
#ifndef CTS_USES_VULKANSC
	basicComputeTests->addChild(cts_amber::createAmberTestCase(testCtx, "write_ssbo_array", "", "compute", "write_ssbo_array.amber"));
	basicComputeTests->addChild(cts_amber::createAmberTestCase(testCtx, "branch_past_barrier", "", "compute", "branch_past_barrier.amber"));
#endif

	return basicComputeTests.release();
}
3921
createBasicDeviceGroupComputeShaderTests(tcu::TestContext & testCtx)3922 tcu::TestCaseGroup* createBasicDeviceGroupComputeShaderTests (tcu::TestContext& testCtx)
3923 {
3924 de::MovePtr<tcu::TestCaseGroup> deviceGroupComputeTests(new tcu::TestCaseGroup(testCtx, "device_group", "Basic device group compute tests"));
3925
3926 deviceGroupComputeTests->addChild(new DispatchBaseTest(testCtx, "dispatch_base", "Compute shader with base groups", 32768, tcu::IVec3(4,2,4), tcu::IVec3(16,8,8), tcu::IVec3(4,8,8)));
3927 deviceGroupComputeTests->addChild(new DeviceIndexTest(testCtx, "device_index", "Compute shader using deviceIndex in SPIRV", 96, tcu::IVec3(3,2,1), tcu::IVec3(2,4,1)));
3928
3929 return deviceGroupComputeTests.release();
3930
3931 }
3932 } // compute
3933 } // vkt
3934