1 /*------------------------------------------------------------------------
2 * Vulkan Conformance Tests
3 * ------------------------
4 *
5 * Copyright (c) 2019 The Khronos Group Inc.
6 * Copyright (c) 2019 The Android Open Source Project
7 *
8 * Licensed under the Apache License, Version 2.0 (the "License");
9 * you may not use this file except in compliance with the License.
10 * You may obtain a copy of the License at
11 *
12 * http://www.apache.org/licenses/LICENSE-2.0
13 *
14 * Unless required by applicable law or agreed to in writing, software
15 * distributed under the License is distributed on an "AS IS" BASIS,
16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 * See the License for the specific language governing permissions and
18 * limitations under the License.
19 *
20 *//*!
21 * \file
22 * \brief Compute Shader Tests
23 *//*--------------------------------------------------------------------*/
24
25 #include "vktComputeBasicComputeShaderTests.hpp"
26 #include "vktTestCase.hpp"
27 #include "vktTestCaseUtil.hpp"
28 #include "vktComputeTestsUtil.hpp"
29 #include "vktCustomInstancesDevices.hpp"
30 #include "vktAmberTestCase.hpp"
31
32 #include "vkDefs.hpp"
33 #include "vkRef.hpp"
34 #include "vkRefUtil.hpp"
35 #include "vkPlatform.hpp"
36 #include "vkPrograms.hpp"
37 #include "vkRefUtil.hpp"
38 #include "vkMemUtil.hpp"
39 #include "vkBarrierUtil.hpp"
40 #include "vkQueryUtil.hpp"
41 #include "vkBuilderUtil.hpp"
42 #include "vkTypeUtil.hpp"
43 #include "vkDeviceUtil.hpp"
44 #include "vkCmdUtil.hpp"
45 #include "vkObjUtil.hpp"
46 #include "vkBufferWithMemory.hpp"
47
48 #include "tcuCommandLine.hpp"
49 #include "tcuTestLog.hpp"
50
51 #include "deStringUtil.hpp"
52 #include "deUniquePtr.hpp"
53 #include "deRandom.hpp"
54
55 #include <vector>
56 #include <memory>
57
58 using namespace vk;
59
60 namespace vkt
61 {
62 namespace compute
63 {
64 namespace
65 {
66
67 template<typename T, int size>
multiplyComponents(const tcu::Vector<T,size> & v)68 T multiplyComponents (const tcu::Vector<T, size>& v)
69 {
70 T accum = 1;
71 for (int i = 0; i < size; ++i)
72 accum *= v[i];
73 return accum;
74 }
75
//! Square of a value: a * a.
template<typename T>
inline T squared (const T& a)
{
	const T result = a * a;
	return result;
}
81
make2DImageCreateInfo(const tcu::IVec2 & imageSize,const VkImageUsageFlags usage)82 inline VkImageCreateInfo make2DImageCreateInfo (const tcu::IVec2& imageSize, const VkImageUsageFlags usage)
83 {
84 const VkImageCreateInfo imageParams =
85 {
86 VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, // VkStructureType sType;
87 DE_NULL, // const void* pNext;
88 0u, // VkImageCreateFlags flags;
89 VK_IMAGE_TYPE_2D, // VkImageType imageType;
90 VK_FORMAT_R32_UINT, // VkFormat format;
91 vk::makeExtent3D(imageSize.x(), imageSize.y(), 1), // VkExtent3D extent;
92 1u, // deUint32 mipLevels;
93 1u, // deUint32 arrayLayers;
94 VK_SAMPLE_COUNT_1_BIT, // VkSampleCountFlagBits samples;
95 VK_IMAGE_TILING_OPTIMAL, // VkImageTiling tiling;
96 usage, // VkImageUsageFlags usage;
97 VK_SHARING_MODE_EXCLUSIVE, // VkSharingMode sharingMode;
98 0u, // deUint32 queueFamilyIndexCount;
99 DE_NULL, // const deUint32* pQueueFamilyIndices;
100 VK_IMAGE_LAYOUT_UNDEFINED, // VkImageLayout initialLayout;
101 };
102 return imageParams;
103 }
104
makeBufferImageCopy(const tcu::IVec2 & imageSize)105 inline VkBufferImageCopy makeBufferImageCopy(const tcu::IVec2& imageSize)
106 {
107 return compute::makeBufferImageCopy(vk::makeExtent3D(imageSize.x(), imageSize.y(), 1), 1u);
108 }
109
// Kind of descriptor buffer backing test data: uniform buffer vs. shader storage buffer.
enum BufferType
{
	BUFFER_TYPE_UNIFORM,
	BUFFER_TYPE_SSBO,
};
115
//! Test case: invocations of a workgroup communicate through a shared array
//! (see initPrograms for the generated shader).
class SharedVarTest : public vkt::TestCase
{
public:
					SharedVarTest	(tcu::TestContext&	testCtx,
									 const std::string&	name,
									 const std::string&	description,
									 const tcu::IVec3&	localSize,
									 const tcu::IVec3&	workSize);

	void			initPrograms	(SourceCollections& sourceCollections) const;
	TestInstance*	createInstance	(Context& context) const;

private:
	const tcu::IVec3	m_localSize;	// Workgroup (local) size.
	const tcu::IVec3	m_workSize;		// Number of workgroups dispatched.
};
132
//! Instance that executes the shared-variable shader and validates the output buffer.
class SharedVarTestInstance : public vkt::TestInstance
{
public:
					SharedVarTestInstance	(Context&			context,
											 const tcu::IVec3&	localSize,
											 const tcu::IVec3&	workSize);

	tcu::TestStatus	iterate					(void);

private:
	const tcu::IVec3	m_localSize;	// Workgroup (local) size.
	const tcu::IVec3	m_workSize;		// Number of workgroups dispatched.
};
146
// localSize: invocations per workgroup; workSize: workgroups per dispatch.
SharedVarTest::SharedVarTest (tcu::TestContext&		testCtx,
							  const std::string&	name,
							  const std::string&	description,
							  const tcu::IVec3&		localSize,
							  const tcu::IVec3&		workSize)
	: TestCase		(testCtx, name, description)
	, m_localSize	(localSize)
	, m_workSize	(workSize)
{
}
157
// Generates a compute shader in which each invocation writes
// globalOffs + localOffs^2 into the *mirrored* slot of a shared array
// (index localSize-localOffs-1), then, after a shared-memory barrier, reads its
// own slot and stores it to the output SSBO. Reading back a value written by a
// different invocation exercises shared-variable communication.
void SharedVarTest::initPrograms (SourceCollections& sourceCollections) const
{
	const int workGroupSize = multiplyComponents(m_localSize);	// invocations per workgroup
	const int workGroupCount = multiplyComponents(m_workSize);	// workgroups per dispatch
	const int numValues = workGroupSize * workGroupCount;		// total output elements

	std::ostringstream src;
	src << "#version 310 es\n"
		<< "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
		<< "layout(binding = 0) writeonly buffer Output {\n"
		<< "    uint values[" << numValues << "];\n"
		<< "} sb_out;\n\n"
		<< "shared uint offsets[" << workGroupSize << "];\n\n"
		<< "void main (void) {\n"
		<< "    uint localSize  = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_WorkGroupSize.z;\n"
		<< "    uint globalNdx  = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
		<< "    uint globalOffs = localSize*globalNdx;\n"
		<< "    uint localOffs  = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_LocalInvocationID.z + gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x;\n"
		<< "\n"
		<< "    offsets[localSize-localOffs-1u] = globalOffs + localOffs*localOffs;\n"
		<< "    memoryBarrierShared();\n"
		<< "    barrier();\n"
		<< "    sb_out.values[globalOffs + localOffs] = offsets[localOffs];\n"
		<< "}\n";

	sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
}
185
createInstance(Context & context) const186 TestInstance* SharedVarTest::createInstance (Context& context) const
187 {
188 return new SharedVarTestInstance(context, m_localSize, m_workSize);
189 }
190
// Stores the dispatch configuration; all work happens in iterate().
SharedVarTestInstance::SharedVarTestInstance (Context& context, const tcu::IVec3& localSize, const tcu::IVec3& workSize)
	: TestInstance	(context)
	, m_localSize	(localSize)
	, m_workSize	(workSize)
{
}
197
// Builds an SSBO-backed compute pipeline, dispatches the "comp" shader over
// m_workSize workgroups, and checks each output element equals
// globalOffset + (workGroupSize - localOffset - 1)^2 — the value routed through
// the shared array by the mirrored invocation.
tcu::TestStatus SharedVarTestInstance::iterate (void)
{
	const DeviceInterface&	vk					= m_context.getDeviceInterface();
	const VkDevice			device				= m_context.getDevice();
	const VkQueue			queue				= m_context.getUniversalQueue();
	const deUint32			queueFamilyIndex	= m_context.getUniversalQueueFamilyIndex();
	Allocator&				allocator			= m_context.getDefaultAllocator();

	const int workGroupSize = multiplyComponents(m_localSize);
	const int workGroupCount = multiplyComponents(m_workSize);

	// Create a buffer and host-visible memory for it

	const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * workGroupSize * workGroupCount;	// one uint per invocation
	const Buffer buffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);

	// Create descriptor set (single storage buffer at binding 0)

	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
		DescriptorSetLayoutBuilder()
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.build(vk, device));

	const Unique<VkDescriptorPool> descriptorPool(
		DescriptorPoolBuilder()
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

	const VkDescriptorBufferInfo descriptorInfo = makeDescriptorBufferInfo(*buffer, 0ull, bufferSizeBytes);
	DescriptorSetUpdateBuilder()
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &descriptorInfo)
		.update(vk, device);

	// Perform the computation

	const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
	const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
	const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));

	// Make the shader's buffer writes visible to host reads after the dispatch.
	const VkBufferMemoryBarrier computeFinishBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer, 0ull, bufferSizeBytes);

	const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
	const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

	// Start recording commands

	beginCommandBuffer(vk, *cmdBuffer);

	vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
	vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);

	vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());

	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &computeFinishBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

	endCommandBuffer(vk, *cmdBuffer);

	// Wait for completion

	submitCommandsAndWait(vk, device, queue, *cmdBuffer);

	// Validate the results

	const Allocation& bufferAllocation = buffer.getAllocation();
	// Invalidate so device writes are visible on non-coherent host memory.
	invalidateAlloc(vk, device, bufferAllocation);

	const deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());

	for (int groupNdx = 0; groupNdx < workGroupCount; ++groupNdx)
	{
		const int globalOffset = groupNdx * workGroupSize;
		for (int localOffset = 0; localOffset < workGroupSize; ++localOffset)
		{
			const deUint32 res = bufferPtr[globalOffset + localOffset];
			// Value written into shared slot localOffset by invocation (workGroupSize - localOffset - 1).
			const deUint32 ref = globalOffset + squared(workGroupSize - localOffset - 1);

			if (res != ref)
			{
				std::ostringstream msg;
				msg << "Comparison failed for Output.values[" << (globalOffset + localOffset) << "]";
				return tcu::TestStatus::fail(msg.str());
			}
		}
	}
	return tcu::TestStatus::pass("Compute succeeded");
}
286
//! Test case: workgroup invocations claim unique output slots via atomicAdd on a
//! shared counter (see initPrograms for the generated shader).
class SharedVarAtomicOpTest : public vkt::TestCase
{
public:
					SharedVarAtomicOpTest	(tcu::TestContext&	testCtx,
											 const std::string&	name,
											 const std::string&	description,
											 const tcu::IVec3&	localSize,
											 const tcu::IVec3&	workSize);

	void			initPrograms			(SourceCollections& sourceCollections) const;
	TestInstance*	createInstance			(Context& context) const;

private:
	const tcu::IVec3	m_localSize;	// Workgroup (local) size.
	const tcu::IVec3	m_workSize;		// Number of workgroups dispatched.
};
303
//! Instance that executes the shared-atomic shader and validates the output buffer.
class SharedVarAtomicOpTestInstance : public vkt::TestInstance
{
public:
					SharedVarAtomicOpTestInstance	(Context&			context,
													 const tcu::IVec3&	localSize,
													 const tcu::IVec3&	workSize);

	tcu::TestStatus	iterate							(void);

private:
	const tcu::IVec3	m_localSize;	// Workgroup (local) size.
	const tcu::IVec3	m_workSize;		// Number of workgroups dispatched.
};
317
// localSize: invocations per workgroup; workSize: workgroups per dispatch.
SharedVarAtomicOpTest::SharedVarAtomicOpTest (tcu::TestContext&		testCtx,
											  const std::string&	name,
											  const std::string&	description,
											  const tcu::IVec3&		localSize,
											  const tcu::IVec3&		workSize)
	: TestCase		(testCtx, name, description)
	, m_localSize	(localSize)
	, m_workSize	(workSize)
{
}
328
// Generates a compute shader where every invocation zeroes a shared counter
// (all write the same value, then synchronize with a barrier), then performs
// atomicAdd(count, 1) to claim a unique slot oldVal, and stores oldVal+1 at
// values[globalOffs + oldVal]. Per workgroup the output is thus 1..localSize
// in slot order, regardless of which invocation claimed which slot.
void SharedVarAtomicOpTest::initPrograms (SourceCollections& sourceCollections) const
{
	const int workGroupSize = multiplyComponents(m_localSize);	// invocations per workgroup
	const int workGroupCount = multiplyComponents(m_workSize);	// workgroups per dispatch
	const int numValues = workGroupSize * workGroupCount;		// total output elements

	std::ostringstream src;
	src << "#version 310 es\n"
		<< "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
		<< "layout(binding = 0) writeonly buffer Output {\n"
		<< "    uint values[" << numValues << "];\n"
		<< "} sb_out;\n\n"
		<< "shared uint count;\n\n"
		<< "void main (void) {\n"
		<< "    uint localSize  = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_WorkGroupSize.z;\n"
		<< "    uint globalNdx  = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
		<< "    uint globalOffs = localSize*globalNdx;\n"
		<< "\n"
		<< "    count = 0u;\n"
		<< "    memoryBarrierShared();\n"
		<< "    barrier();\n"
		<< "    uint oldVal = atomicAdd(count, 1u);\n"
		<< "    sb_out.values[globalOffs+oldVal] = oldVal+1u;\n"
		<< "}\n";

	sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
}
356
createInstance(Context & context) const357 TestInstance* SharedVarAtomicOpTest::createInstance (Context& context) const
358 {
359 return new SharedVarAtomicOpTestInstance(context, m_localSize, m_workSize);
360 }
361
// Stores the dispatch configuration; all work happens in iterate().
SharedVarAtomicOpTestInstance::SharedVarAtomicOpTestInstance (Context& context, const tcu::IVec3& localSize, const tcu::IVec3& workSize)
	: TestInstance	(context)
	, m_localSize	(localSize)
	, m_workSize	(workSize)
{
}
368
// Builds an SSBO-backed compute pipeline, dispatches the "comp" shader over
// m_workSize workgroups, and checks that within each workgroup the output slot
// at localOffset holds localOffset + 1 (each invocation stored oldVal+1 into
// the slot it claimed via atomicAdd).
tcu::TestStatus SharedVarAtomicOpTestInstance::iterate (void)
{
	const DeviceInterface&	vk					= m_context.getDeviceInterface();
	const VkDevice			device				= m_context.getDevice();
	const VkQueue			queue				= m_context.getUniversalQueue();
	const deUint32			queueFamilyIndex	= m_context.getUniversalQueueFamilyIndex();
	Allocator&				allocator			= m_context.getDefaultAllocator();

	const int workGroupSize = multiplyComponents(m_localSize);
	const int workGroupCount = multiplyComponents(m_workSize);

	// Create a buffer and host-visible memory for it

	const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * workGroupSize * workGroupCount;	// one uint per invocation
	const Buffer buffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);

	// Create descriptor set (single storage buffer at binding 0)

	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
		DescriptorSetLayoutBuilder()
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.build(vk, device));

	const Unique<VkDescriptorPool> descriptorPool(
		DescriptorPoolBuilder()
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

	const VkDescriptorBufferInfo descriptorInfo = makeDescriptorBufferInfo(*buffer, 0ull, bufferSizeBytes);
	DescriptorSetUpdateBuilder()
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &descriptorInfo)
		.update(vk, device);

	// Perform the computation

	const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
	const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
	const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));

	// Make the shader's buffer writes visible to host reads after the dispatch.
	const VkBufferMemoryBarrier computeFinishBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer, 0ull, bufferSizeBytes);

	const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
	const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

	// Start recording commands

	beginCommandBuffer(vk, *cmdBuffer);

	vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
	vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);

	vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());

	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1u, &computeFinishBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

	endCommandBuffer(vk, *cmdBuffer);

	// Wait for completion

	submitCommandsAndWait(vk, device, queue, *cmdBuffer);

	// Validate the results

	const Allocation& bufferAllocation = buffer.getAllocation();
	// Invalidate so device writes are visible on non-coherent host memory.
	invalidateAlloc(vk, device, bufferAllocation);

	const deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());

	for (int groupNdx = 0; groupNdx < workGroupCount; ++groupNdx)
	{
		const int globalOffset = groupNdx * workGroupSize;
		for (int localOffset = 0; localOffset < workGroupSize; ++localOffset)
		{
			const deUint32 res = bufferPtr[globalOffset + localOffset];
			// Slot localOffset was claimed by exactly one invocation, which stored localOffset+1.
			const deUint32 ref = localOffset + 1;

			if (res != ref)
			{
				std::ostringstream msg;
				msg << "Comparison failed for Output.values[" << (globalOffset + localOffset) << "]";
				return tcu::TestStatus::fail(msg.str());
			}
		}
	}
	return tcu::TestStatus::pass("Compute succeeded");
}
457
//! Test case: invocations of a workgroup communicate through a coherent SSBO,
//! synchronizing with memoryBarrierBuffer()+barrier() (see initPrograms).
class SSBOLocalBarrierTest : public vkt::TestCase
{
public:
					SSBOLocalBarrierTest	(tcu::TestContext&	testCtx,
											 const std::string&	name,
											 const std::string&	description,
											 const tcu::IVec3&	localSize,
											 const tcu::IVec3&	workSize);

	void			initPrograms			(SourceCollections& sourceCollections) const;
	TestInstance*	createInstance			(Context& context) const;

private:
	const tcu::IVec3	m_localSize;	// Workgroup (local) size.
	const tcu::IVec3	m_workSize;		// Number of workgroups dispatched.
};
474
//! Instance that executes the SSBO-barrier shader and validates the output buffer.
class SSBOLocalBarrierTestInstance : public vkt::TestInstance
{
public:
					SSBOLocalBarrierTestInstance	(Context&			context,
													 const tcu::IVec3&	localSize,
													 const tcu::IVec3&	workSize);

	tcu::TestStatus	iterate							(void);

private:
	const tcu::IVec3	m_localSize;	// Workgroup (local) size.
	const tcu::IVec3	m_workSize;		// Number of workgroups dispatched.
};
488
// localSize: invocations per workgroup; workSize: workgroups per dispatch.
SSBOLocalBarrierTest::SSBOLocalBarrierTest (tcu::TestContext&	testCtx,
											const std::string&	name,
											const std::string&	description,
											const tcu::IVec3&	localSize,
											const tcu::IVec3&	workSize)
	: TestCase		(testCtx, name, description)
	, m_localSize	(localSize)
	, m_workSize	(workSize)
{
}
499
// Generates a compute shader in which each invocation initializes its own SSBO
// slot to globalOffs, then (between buffer-memory barriers) adds its localOffs
// into the slots of the next and next-next invocations (mod localSize). The
// read-modify-write (+=) makes each step both read and write coherent SSBO data
// written by other invocations, exercising memoryBarrierBuffer()+barrier().
void SSBOLocalBarrierTest::initPrograms (SourceCollections& sourceCollections) const
{
	const int workGroupSize = multiplyComponents(m_localSize);	// invocations per workgroup
	const int workGroupCount = multiplyComponents(m_workSize);	// workgroups per dispatch
	const int numValues = workGroupSize * workGroupCount;		// total output elements

	std::ostringstream src;
	src << "#version 310 es\n"
		<< "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
		<< "layout(binding = 0) coherent buffer Output {\n"
		<< "    uint values[" << numValues << "];\n"
		<< "} sb_out;\n\n"
		<< "void main (void) {\n"
		<< "    uint localSize  = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_WorkGroupSize.z;\n"
		<< "    uint globalNdx  = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
		<< "    uint globalOffs = localSize*globalNdx;\n"
		<< "    uint localOffs  = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_LocalInvocationID.z + gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x;\n"
		<< "\n"
		<< "    sb_out.values[globalOffs + localOffs] = globalOffs;\n"
		<< "    memoryBarrierBuffer();\n"
		<< "    barrier();\n"
		<< "    sb_out.values[globalOffs + ((localOffs+1u)%localSize)] += localOffs;\n" // += so we read and write
		<< "    memoryBarrierBuffer();\n"
		<< "    barrier();\n"
		<< "    sb_out.values[globalOffs + ((localOffs+2u)%localSize)] += localOffs;\n"
		<< "}\n";

	sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
}
529
createInstance(Context & context) const530 TestInstance* SSBOLocalBarrierTest::createInstance (Context& context) const
531 {
532 return new SSBOLocalBarrierTestInstance(context, m_localSize, m_workSize);
533 }
534
// Stores the dispatch configuration; all work happens in iterate().
SSBOLocalBarrierTestInstance::SSBOLocalBarrierTestInstance (Context& context, const tcu::IVec3& localSize, const tcu::IVec3& workSize)
	: TestInstance	(context)
	, m_localSize	(localSize)
	, m_workSize	(workSize)
{
}
541
// Builds an SSBO-backed compute pipeline, dispatches the "comp" shader over
// m_workSize workgroups, and checks each output element: slot i starts at
// globalOffset and is incremented by invocations (i-1) and (i-2) mod
// workGroupSize, so the expected value is globalOffset + offs0 + offs1.
tcu::TestStatus SSBOLocalBarrierTestInstance::iterate (void)
{
	const DeviceInterface&	vk					= m_context.getDeviceInterface();
	const VkDevice			device				= m_context.getDevice();
	const VkQueue			queue				= m_context.getUniversalQueue();
	const deUint32			queueFamilyIndex	= m_context.getUniversalQueueFamilyIndex();
	Allocator&				allocator			= m_context.getDefaultAllocator();

	const int workGroupSize = multiplyComponents(m_localSize);
	const int workGroupCount = multiplyComponents(m_workSize);

	// Create a buffer and host-visible memory for it

	const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * workGroupSize * workGroupCount;	// one uint per invocation
	const Buffer buffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);

	// Create descriptor set (single storage buffer at binding 0)

	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
		DescriptorSetLayoutBuilder()
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.build(vk, device));

	const Unique<VkDescriptorPool> descriptorPool(
		DescriptorPoolBuilder()
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

	const VkDescriptorBufferInfo descriptorInfo = makeDescriptorBufferInfo(*buffer, 0ull, bufferSizeBytes);
	DescriptorSetUpdateBuilder()
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &descriptorInfo)
		.update(vk, device);

	// Perform the computation

	const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
	const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
	const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));

	// Make the shader's buffer writes visible to host reads after the dispatch.
	const VkBufferMemoryBarrier computeFinishBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer, 0ull, bufferSizeBytes);

	const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
	const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

	// Start recording commands

	beginCommandBuffer(vk, *cmdBuffer);

	vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
	vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);

	vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());

	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &computeFinishBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

	endCommandBuffer(vk, *cmdBuffer);

	// Wait for completion

	submitCommandsAndWait(vk, device, queue, *cmdBuffer);

	// Validate the results

	const Allocation& bufferAllocation = buffer.getAllocation();
	// Invalidate so device writes are visible on non-coherent host memory.
	invalidateAlloc(vk, device, bufferAllocation);

	const deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());

	for (int groupNdx = 0; groupNdx < workGroupCount; ++groupNdx)
	{
		const int globalOffset = groupNdx * workGroupSize;
		for (int localOffset = 0; localOffset < workGroupSize; ++localOffset)
		{
			const deUint32 res = bufferPtr[globalOffset + localOffset];
			// Indices of the two invocations that incremented this slot: (i-1) and (i-2) mod workGroupSize.
			const int offs0 = localOffset - 1 < 0 ? ((localOffset + workGroupSize - 1) % workGroupSize) : ((localOffset - 1) % workGroupSize);
			const int offs1 = localOffset - 2 < 0 ? ((localOffset + workGroupSize - 2) % workGroupSize) : ((localOffset - 2) % workGroupSize);
			const deUint32 ref = static_cast<deUint32>(globalOffset + offs0 + offs1);

			if (res != ref)
			{
				std::ostringstream msg;
				msg << "Comparison failed for Output.values[" << (globalOffset + localOffset) << "]";
				return tcu::TestStatus::fail(msg.str());
			}
		}
	}
	return tcu::TestStatus::pass("Compute succeeded");
}
632
//! Test case: a compute shader copies an r32ui image into an SSBO texel-by-texel
//! (see initPrograms for the generated shader).
class CopyImageToSSBOTest : public vkt::TestCase
{
public:
					CopyImageToSSBOTest	(tcu::TestContext&	testCtx,
										 const std::string&	name,
										 const std::string&	description,
										 const tcu::IVec2&	localSize,
										 const tcu::IVec2&	imageSize);

	void			initPrograms		(SourceCollections& sourceCollections) const;
	TestInstance*	createInstance		(Context& context) const;

private:
	const tcu::IVec2	m_localSize;	// 2D workgroup (local) size.
	const tcu::IVec2	m_imageSize;	// Size of the source image; must be a multiple of m_localSize.
};
649
//! Instance that uploads the source image, runs the copy shader and checks the SSBO.
class CopyImageToSSBOTestInstance : public vkt::TestInstance
{
public:
					CopyImageToSSBOTestInstance	(Context&			context,
												 const tcu::IVec2&	localSize,
												 const tcu::IVec2&	imageSize);

	tcu::TestStatus	iterate						(void);

private:
	const tcu::IVec2	m_localSize;	// 2D workgroup (local) size.
	const tcu::IVec2	m_imageSize;	// Size of the source image.
};
663
// localSize: 2D workgroup size; imageSize: source image size. The image must be
// evenly divisible by the workgroup size so the dispatch covers it exactly.
CopyImageToSSBOTest::CopyImageToSSBOTest (tcu::TestContext&		testCtx,
										  const std::string&	name,
										  const std::string&	description,
										  const tcu::IVec2&		localSize,
										  const tcu::IVec2&		imageSize)
	: TestCase		(testCtx, name, description)
	, m_localSize	(localSize)
	, m_imageSize	(imageSize)
{
	DE_ASSERT(m_imageSize.x() % m_localSize.x() == 0);
	DE_ASSERT(m_imageSize.y() % m_localSize.y() == 0);
}
676
// Generates a compute shader: each invocation loads the texel at its global
// invocation ID from the read-only r32ui image (binding 1) and writes it into
// the row-major linearized output SSBO (binding 0), using the dispatch width
// (gl_NumWorkGroups.x * gl_WorkGroupSize.x) as the row stride.
void CopyImageToSSBOTest::initPrograms (SourceCollections& sourceCollections) const
{
	std::ostringstream src;
	src << "#version 310 es\n"
		<< "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ") in;\n"
		<< "layout(binding = 1, r32ui) readonly uniform highp uimage2D u_srcImg;\n"
		<< "layout(binding = 0) writeonly buffer Output {\n"
		<< "    uint values[" << (m_imageSize.x() * m_imageSize.y()) << "];\n"
		<< "} sb_out;\n\n"
		<< "void main (void) {\n"
		<< "    uint stride = gl_NumWorkGroups.x*gl_WorkGroupSize.x;\n"
		<< "    uint value  = imageLoad(u_srcImg, ivec2(gl_GlobalInvocationID.xy)).x;\n"
		<< "    sb_out.values[gl_GlobalInvocationID.y*stride + gl_GlobalInvocationID.x] = value;\n"
		<< "}\n";

	sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
}
694
createInstance(Context & context) const695 TestInstance* CopyImageToSSBOTest::createInstance (Context& context) const
696 {
697 return new CopyImageToSSBOTestInstance(context, m_localSize, m_imageSize);
698 }
699
// Stores the dispatch configuration; all work happens in iterate().
CopyImageToSSBOTestInstance::CopyImageToSSBOTestInstance (Context& context, const tcu::IVec2& localSize, const tcu::IVec2& imageSize)
	: TestInstance	(context)
	, m_localSize	(localSize)
	, m_imageSize	(imageSize)
{
}
706
iterate(void)707 tcu::TestStatus CopyImageToSSBOTestInstance::iterate (void)
708 {
709 const DeviceInterface& vk = m_context.getDeviceInterface();
710 const VkDevice device = m_context.getDevice();
711 const VkQueue queue = m_context.getUniversalQueue();
712 const deUint32 queueFamilyIndex = m_context.getUniversalQueueFamilyIndex();
713 Allocator& allocator = m_context.getDefaultAllocator();
714
715 // Create an image
716
717 const VkImageCreateInfo imageParams = make2DImageCreateInfo(m_imageSize, VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_STORAGE_BIT);
718 const Image image(vk, device, allocator, imageParams, MemoryRequirement::Any);
719
720 const VkImageSubresourceRange subresourceRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
721 const Unique<VkImageView> imageView(makeImageView(vk, device, *image, VK_IMAGE_VIEW_TYPE_2D, VK_FORMAT_R32_UINT, subresourceRange));
722
723 // Staging buffer (source data for image)
724
725 const deUint32 imageArea = multiplyComponents(m_imageSize);
726 const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * imageArea;
727
728 const Buffer stagingBuffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_TRANSFER_SRC_BIT), MemoryRequirement::HostVisible);
729
730 // Populate the staging buffer with test data
731 {
732 de::Random rnd(0xab2c7);
733 const Allocation& stagingBufferAllocation = stagingBuffer.getAllocation();
734 deUint32* bufferPtr = static_cast<deUint32*>(stagingBufferAllocation.getHostPtr());
735 for (deUint32 i = 0; i < imageArea; ++i)
736 *bufferPtr++ = rnd.getUint32();
737
738 flushAlloc(vk, device, stagingBufferAllocation);
739 }
740
741 // Create a buffer to store shader output
742
743 const Buffer outputBuffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
744
745 // Create descriptor set
746
747 const Unique<VkDescriptorSetLayout> descriptorSetLayout(
748 DescriptorSetLayoutBuilder()
749 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
750 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_SHADER_STAGE_COMPUTE_BIT)
751 .build(vk, device));
752
753 const Unique<VkDescriptorPool> descriptorPool(
754 DescriptorPoolBuilder()
755 .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
756 .addType(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE)
757 .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
758
759 const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
760
761 // Set the bindings
762
763 const VkDescriptorImageInfo imageDescriptorInfo = makeDescriptorImageInfo(DE_NULL, *imageView, VK_IMAGE_LAYOUT_GENERAL);
764 const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*outputBuffer, 0ull, bufferSizeBytes);
765
766 DescriptorSetUpdateBuilder()
767 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
768 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &imageDescriptorInfo)
769 .update(vk, device);
770
771 // Perform the computation
772 {
773 const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
774 const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
775 const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));
776
777 const VkBufferMemoryBarrier computeFinishBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *outputBuffer, 0ull, bufferSizeBytes);
778 const tcu::IVec2 workSize = m_imageSize / m_localSize;
779
780 // Prepare the command buffer
781
782 const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
783 const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
784
785 // Start recording commands
786
787 beginCommandBuffer(vk, *cmdBuffer);
788
789 vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
790 vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
791
792 const std::vector<VkBufferImageCopy> bufferImageCopy(1, makeBufferImageCopy(m_imageSize));
793 copyBufferToImage(vk, *cmdBuffer, *stagingBuffer, bufferSizeBytes, bufferImageCopy, VK_IMAGE_ASPECT_COLOR_BIT, 1, 1, *image, VK_IMAGE_LAYOUT_GENERAL, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT);
794
795 vk.cmdDispatch(*cmdBuffer, workSize.x(), workSize.y(), 1u);
796 vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &computeFinishBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
797
798 endCommandBuffer(vk, *cmdBuffer);
799
800 // Wait for completion
801
802 submitCommandsAndWait(vk, device, queue, *cmdBuffer);
803 }
804
805 // Validate the results
806
807 const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
808 invalidateAlloc(vk, device, outputBufferAllocation);
809
810 const deUint32* bufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
811 const deUint32* refBufferPtr = static_cast<deUint32*>(stagingBuffer.getAllocation().getHostPtr());
812
813 for (deUint32 ndx = 0; ndx < imageArea; ++ndx)
814 {
815 const deUint32 res = *(bufferPtr + ndx);
816 const deUint32 ref = *(refBufferPtr + ndx);
817
818 if (res != ref)
819 {
820 std::ostringstream msg;
821 msg << "Comparison failed for Output.values[" << ndx << "]";
822 return tcu::TestStatus::fail(msg.str());
823 }
824 }
825 return tcu::TestStatus::pass("Compute succeeded");
826 }
827
// Test case: a compute shader reads uint values from an SSBO and stores them
// into the matching texels of a r32ui storage image. The instance then reads
// the image back and verifies it matches the buffer contents.
class CopySSBOToImageTest : public vkt::TestCase
{
public:
									// localSize must evenly divide imageSize in both dimensions
									// (asserted in the constructor).
									CopySSBOToImageTest		(tcu::TestContext&		testCtx,
															 const std::string&		name,
															 const std::string&		description,
															 const tcu::IVec2&		localSize,
															 const tcu::IVec2&		imageSize);

	// Registers the "comp" GLSL compute shader.
	void							initPrograms			(SourceCollections&		sourceCollections) const;
	// Creates the per-run instance that records and submits the work.
	TestInstance*					createInstance			(Context&				context) const;

private:
	const tcu::IVec2				m_localSize;	// workgroup (local) size, in invocations
	const tcu::IVec2				m_imageSize;	// image dimensions, in texels
};
844
// Runtime counterpart of CopySSBOToImageTest: performs the actual Vulkan work
// (buffer/image creation, dispatch, readback and comparison) in iterate().
class CopySSBOToImageTestInstance : public vkt::TestInstance
{
public:
									CopySSBOToImageTestInstance	(Context&			context,
																 const tcu::IVec2&	localSize,
																 const tcu::IVec2&	imageSize);

	// Executes one run of the test and returns pass/fail.
	tcu::TestStatus					iterate						(void);

private:
	const tcu::IVec2				m_localSize;	// workgroup (local) size, in invocations
	const tcu::IVec2				m_imageSize;	// image dimensions, in texels
};
858
// Constructor: stores the configuration and sanity-checks that the image can
// be tiled exactly by the workgroup size.
CopySSBOToImageTest::CopySSBOToImageTest (tcu::TestContext&		testCtx,
										  const std::string&	name,
										  const std::string&	description,
										  const tcu::IVec2&		localSize,
										  const tcu::IVec2&		imageSize)
	: TestCase		(testCtx, name, description)
	, m_localSize	(localSize)
	, m_imageSize	(imageSize)
{
	// iterate() computes the dispatch size as imageSize / localSize, so each
	// dimension must be an exact multiple or texels would be left unwritten.
	DE_ASSERT(m_imageSize.x() % m_localSize.x() == 0);
	DE_ASSERT(m_imageSize.y() % m_localSize.y() == 0);
}
871
initPrograms(SourceCollections & sourceCollections) const872 void CopySSBOToImageTest::initPrograms (SourceCollections& sourceCollections) const
873 {
874 std::ostringstream src;
875 src << "#version 310 es\n"
876 << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ") in;\n"
877 << "layout(binding = 1, r32ui) writeonly uniform highp uimage2D u_dstImg;\n"
878 << "layout(binding = 0) readonly buffer Input {\n"
879 << " uint values[" << (m_imageSize.x() * m_imageSize.y()) << "];\n"
880 << "} sb_in;\n\n"
881 << "void main (void) {\n"
882 << " uint stride = gl_NumWorkGroups.x*gl_WorkGroupSize.x;\n"
883 << " uint value = sb_in.values[gl_GlobalInvocationID.y*stride + gl_GlobalInvocationID.x];\n"
884 << " imageStore(u_dstImg, ivec2(gl_GlobalInvocationID.xy), uvec4(value, 0, 0, 0));\n"
885 << "}\n";
886
887 sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
888 }
889
// Creates the runtime instance that executes this test case on the device.
TestInstance* CopySSBOToImageTest::createInstance (Context& context) const
{
	return new CopySSBOToImageTestInstance(context, m_localSize, m_imageSize);
}
894
// Instance constructor: simply captures the configuration chosen by the test case.
CopySSBOToImageTestInstance::CopySSBOToImageTestInstance (Context& context, const tcu::IVec2& localSize, const tcu::IVec2& imageSize)
	: TestInstance	(context)
	, m_localSize	(localSize)
	, m_imageSize	(imageSize)
{
}
901
// Runs the SSBO -> image copy test: fills an SSBO with random uints on the
// host, dispatches the compute shader that stores them into a r32ui storage
// image, copies the image back into a host-visible buffer and compares the
// readback against the original data.
tcu::TestStatus CopySSBOToImageTestInstance::iterate (void)
{
	const DeviceInterface&	vk					= m_context.getDeviceInterface();
	const VkDevice			device				= m_context.getDevice();
	const VkQueue			queue				= m_context.getUniversalQueue();
	const deUint32			queueFamilyIndex	= m_context.getUniversalQueueFamilyIndex();
	Allocator&				allocator			= m_context.getDefaultAllocator();

	// Create an image
	// STORAGE for the shader's imageStore, TRANSFER_SRC for the readback copy.

	const VkImageCreateInfo imageParams = make2DImageCreateInfo(m_imageSize, VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_STORAGE_BIT);
	const Image image(vk, device, allocator, imageParams, MemoryRequirement::Any);

	const VkImageSubresourceRange subresourceRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
	const Unique<VkImageView> imageView(makeImageView(vk, device, *image, VK_IMAGE_VIEW_TYPE_2D, VK_FORMAT_R32_UINT, subresourceRange));

	// Create an input buffer (data to be read in the shader)

	const deUint32 imageArea = multiplyComponents(m_imageSize);
	const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * imageArea;

	const Buffer inputBuffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);

	// Populate the buffer with test data
	{
		de::Random rnd(0x77238ac2);
		const Allocation& inputBufferAllocation = inputBuffer.getAllocation();
		deUint32* bufferPtr = static_cast<deUint32*>(inputBufferAllocation.getHostPtr());
		for (deUint32 i = 0; i < imageArea; ++i)
			*bufferPtr++ = rnd.getUint32();

		// Make host writes visible to the device (allocation may be non-coherent).
		flushAlloc(vk, device, inputBufferAllocation);
	}

	// Create a buffer to store shader output (copied from image data)

	const Buffer outputBuffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_TRANSFER_DST_BIT), MemoryRequirement::HostVisible);

	// Create descriptor set

	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
		DescriptorSetLayoutBuilder()
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_SHADER_STAGE_COMPUTE_BIT)
		.build(vk, device));

	const Unique<VkDescriptorPool> descriptorPool(
		DescriptorPoolBuilder()
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE)
		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

	// Set the bindings
	// Binding numbers match the shader: 0 = input SSBO, 1 = destination image.

	const VkDescriptorImageInfo imageDescriptorInfo = makeDescriptorImageInfo(DE_NULL, *imageView, VK_IMAGE_LAYOUT_GENERAL);
	const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*inputBuffer, 0ull, bufferSizeBytes);

	DescriptorSetUpdateBuilder()
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &imageDescriptorInfo)
		.update(vk, device);

	// Perform the computation
	{
		const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
		const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
		const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));

		// Host writes must become visible to shader reads before the dispatch.
		const VkBufferMemoryBarrier inputBufferPostHostWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *inputBuffer, 0ull, bufferSizeBytes);

		// Transition the image UNDEFINED -> GENERAL so the shader may store to it;
		// no prior access to wait on (srcAccessMask = 0).
		const VkImageMemoryBarrier imageLayoutBarrier = makeImageMemoryBarrier(
			0u, VK_ACCESS_SHADER_WRITE_BIT,
			VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL,
			*image, subresourceRange);

		// One invocation per texel; exact divisibility was asserted by the test case.
		const tcu::IVec2 workSize = m_imageSize / m_localSize;

		// Prepare the command buffer

		const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
		const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

		// Start recording commands

		beginCommandBuffer(vk, *cmdBuffer);

		vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
		vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);

		// Single barrier call covers both the buffer hand-off and the layout transition.
		vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &inputBufferPostHostWriteBarrier, 1, &imageLayoutBarrier);
		vk.cmdDispatch(*cmdBuffer, workSize.x(), workSize.y(), 1u);

		// NOTE(review): copyImageToBuffer is passed the shader-write access and
		// GENERAL layout, so it presumably records the required shader->transfer
		// and transfer->host barriers internally — confirm against the helper.
		copyImageToBuffer(vk, *cmdBuffer, *image, *outputBuffer, m_imageSize, VK_ACCESS_SHADER_WRITE_BIT, VK_IMAGE_LAYOUT_GENERAL);

		endCommandBuffer(vk, *cmdBuffer);

		// Wait for completion

		submitCommandsAndWait(vk, device, queue, *cmdBuffer);
	}

	// Validate the results

	const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
	// Make device writes visible to the host before reading.
	invalidateAlloc(vk, device, outputBufferAllocation);

	const deUint32* bufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
	const deUint32* refBufferPtr = static_cast<deUint32*>(inputBuffer.getAllocation().getHostPtr());

	// Round-trip through the image must preserve every value; fail on the
	// first mismatching texel.
	for (deUint32 ndx = 0; ndx < imageArea; ++ndx)
	{
		const deUint32 res = *(bufferPtr + ndx);
		const deUint32 ref = *(refBufferPtr + ndx);

		if (res != ref)
		{
			std::ostringstream msg;
			msg << "Comparison failed for pixel " << ndx;
			return tcu::TestStatus::fail(msg.str());
		}
	}
	return tcu::TestStatus::pass("Compute succeeded");
}
1027
// Test case that reads uints from an input buffer and writes their bitwise
// inverse into an output SSBO. The input is either a UBO or an SSBO depending
// on which named factory below created the case; the constructor is private
// so bufferType is always one of the two supported values.
class BufferToBufferInvertTest : public vkt::TestCase
{
public:
	// Registers the "comp" GLSL compute shader (UBO or SSBO input variant).
	void								initPrograms				(SourceCollections&	sourceCollections) const;
	TestInstance*						createInstance				(Context&			context) const;

	// Factory: input is a uniform buffer.
	static BufferToBufferInvertTest*	UBOToSSBOInvertCase			(tcu::TestContext&	testCtx,
																	 const std::string&	name,
																	 const std::string&	description,
																	 const deUint32		numValues,
																	 const tcu::IVec3&	localSize,
																	 const tcu::IVec3&	workSize);

	// Factory: input is a storage buffer.
	static BufferToBufferInvertTest*	CopyInvertSSBOCase			(tcu::TestContext&	testCtx,
																	 const std::string&	name,
																	 const std::string&	description,
																	 const deUint32		numValues,
																	 const tcu::IVec3&	localSize,
																	 const tcu::IVec3&	workSize);

private:
										BufferToBufferInvertTest	(tcu::TestContext&	testCtx,
																	 const std::string&	name,
																	 const std::string&	description,
																	 const deUint32		numValues,
																	 const tcu::IVec3&	localSize,
																	 const tcu::IVec3&	workSize,
																	 const BufferType	bufferType);

	const BufferType					m_bufferType;	// BUFFER_TYPE_UNIFORM or BUFFER_TYPE_SSBO
	const deUint32						m_numValues;	// total number of uints processed
	const tcu::IVec3					m_localSize;	// workgroup (local) size
	const tcu::IVec3					m_workSize;		// number of workgroups dispatched
};
1062
// Runtime counterpart of BufferToBufferInvertTest: allocates the buffers,
// dispatches the invert shader and validates the output in iterate().
class BufferToBufferInvertTestInstance : public vkt::TestInstance
{
public:
									BufferToBufferInvertTestInstance	(Context&			context,
																		 const deUint32		numValues,
																		 const tcu::IVec3&	localSize,
																		 const tcu::IVec3&	workSize,
																		 const BufferType	bufferType);

	// Executes one run of the test and returns pass/fail.
	tcu::TestStatus					iterate								(void);

private:
	const BufferType				m_bufferType;	// selects UBO vs SSBO input path
	const deUint32					m_numValues;	// total number of uints processed
	const tcu::IVec3				m_localSize;	// workgroup (local) size
	const tcu::IVec3				m_workSize;		// number of workgroups dispatched
};
1080
// Private constructor: stores the configuration and validates it. Reached only
// via the UBOToSSBOInvertCase / CopyInvertSSBOCase factories.
BufferToBufferInvertTest::BufferToBufferInvertTest (tcu::TestContext&	testCtx,
													const std::string&	name,
													const std::string&	description,
													const deUint32		numValues,
													const tcu::IVec3&	localSize,
													const tcu::IVec3&	workSize,
													const BufferType	bufferType)
	: TestCase		(testCtx, name, description)
	, m_bufferType	(bufferType)
	, m_numValues	(numValues)
	, m_localSize	(localSize)
	, m_workSize	(workSize)
{
	// The shader divides the value count evenly across all invocations, so the
	// total invocation count must divide numValues exactly.
	DE_ASSERT(m_numValues % (multiplyComponents(m_workSize) * multiplyComponents(m_localSize)) == 0);
	DE_ASSERT(m_bufferType == BUFFER_TYPE_UNIFORM || m_bufferType == BUFFER_TYPE_SSBO);
}
1097
// Factory for the variant whose input buffer is a UBO.
BufferToBufferInvertTest* BufferToBufferInvertTest::UBOToSSBOInvertCase (tcu::TestContext&	testCtx,
																		 const std::string&	name,
																		 const std::string&	description,
																		 const deUint32		numValues,
																		 const tcu::IVec3&	localSize,
																		 const tcu::IVec3&	workSize)
{
	return new BufferToBufferInvertTest(testCtx, name, description, numValues, localSize, workSize, BUFFER_TYPE_UNIFORM);
}
1107
// Factory for the variant whose input buffer is an SSBO.
BufferToBufferInvertTest* BufferToBufferInvertTest::CopyInvertSSBOCase (tcu::TestContext&	testCtx,
																		const std::string&	name,
																		const std::string&	description,
																		const deUint32		numValues,
																		const tcu::IVec3&	localSize,
																		const tcu::IVec3&	workSize)
{
	return new BufferToBufferInvertTest(testCtx, name, description, numValues, localSize, workSize, BUFFER_TYPE_SSBO);
}
1117
// Generates the invert compute shader. Both variants split the value array
// evenly across all invocations; each invocation inverts (~) its contiguous
// slice. Buffers use std140 layout (a uint array gets a 16-byte stride —
// matching the one-uint-per-UVec4 host layout used in iterate()).
void BufferToBufferInvertTest::initPrograms (SourceCollections& sourceCollections) const
{
	std::ostringstream src;
	if (m_bufferType == BUFFER_TYPE_UNIFORM)
	{
		// Input is a uniform block (std140 is the default layout for UBOs).
		src << "#version 310 es\n"
			<< "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
			<< "layout(binding = 0) readonly uniform Input {\n"
			<< " uint values[" << m_numValues << "];\n"
			<< "} ub_in;\n"
			<< "layout(binding = 1, std140) writeonly buffer Output {\n"
			<< " uint values[" << m_numValues << "];\n"
			<< "} sb_out;\n"
			<< "void main (void) {\n"
			<< " uvec3 size = gl_NumWorkGroups * gl_WorkGroupSize;\n"
			<< " uint numValuesPerInv = uint(ub_in.values.length()) / (size.x*size.y*size.z);\n"
			<< " uint groupNdx = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
			<< " uint offset = numValuesPerInv*groupNdx;\n"
			<< "\n"
			<< " for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
			<< " sb_out.values[offset + ndx] = ~ub_in.values[offset + ndx];\n"
			<< "}\n";
	}
	else if (m_bufferType == BUFFER_TYPE_SSBO)
	{
		// Same kernel, but the input is an SSBO with explicit std140 layout so
		// both variants share one element stride (and one host-side filler).
		src << "#version 310 es\n"
			<< "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
			<< "layout(binding = 0, std140) readonly buffer Input {\n"
			<< " uint values[" << m_numValues << "];\n"
			<< "} sb_in;\n"
			<< "layout (binding = 1, std140) writeonly buffer Output {\n"
			<< " uint values[" << m_numValues << "];\n"
			<< "} sb_out;\n"
			<< "void main (void) {\n"
			<< " uvec3 size = gl_NumWorkGroups * gl_WorkGroupSize;\n"
			<< " uint numValuesPerInv = uint(sb_in.values.length()) / (size.x*size.y*size.z);\n"
			<< " uint groupNdx = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
			<< " uint offset = numValuesPerInv*groupNdx;\n"
			<< "\n"
			<< " for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
			<< " sb_out.values[offset + ndx] = ~sb_in.values[offset + ndx];\n"
			<< "}\n";
	}

	sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
}
1164
// Creates the runtime instance that executes this test case on the device.
TestInstance* BufferToBufferInvertTest::createInstance (Context& context) const
{
	return new BufferToBufferInvertTestInstance(context, m_numValues, m_localSize, m_workSize, m_bufferType);
}
1169
// Instance constructor: simply captures the configuration chosen by the test case.
BufferToBufferInvertTestInstance::BufferToBufferInvertTestInstance (Context&			context,
																	const deUint32		numValues,
																	const tcu::IVec3&	localSize,
																	const tcu::IVec3&	workSize,
																	const BufferType	bufferType)
	: TestInstance	(context)
	, m_bufferType	(bufferType)
	, m_numValues	(numValues)
	, m_localSize	(localSize)
	, m_workSize	(workSize)
{
}
1182
// Runs the invert test: fills the input buffer with random values (one uint
// per 16-byte std140 element), dispatches the shader that writes the bitwise
// inverse of each value to the output SSBO, then verifies every element.
tcu::TestStatus BufferToBufferInvertTestInstance::iterate (void)
{
	const DeviceInterface&	vk					= m_context.getDeviceInterface();
	const VkDevice			device				= m_context.getDevice();
	const VkQueue			queue				= m_context.getUniversalQueue();
	const deUint32			queueFamilyIndex	= m_context.getUniversalQueueFamilyIndex();
	Allocator&				allocator			= m_context.getDefaultAllocator();

	// Customize the test based on buffer type

	const VkBufferUsageFlags inputBufferUsageFlags = (m_bufferType == BUFFER_TYPE_UNIFORM ? VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT : VK_BUFFER_USAGE_STORAGE_BUFFER_BIT);
	const VkDescriptorType inputBufferDescriptorType = (m_bufferType == BUFFER_TYPE_UNIFORM ? VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER : VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
	// Distinct seeds per variant so the two tests exercise different data.
	const deUint32 randomSeed = (m_bufferType == BUFFER_TYPE_UNIFORM ? 0x111223f : 0x124fef);

	// Create an input buffer
	// std140 gives the shader's uint array a 16-byte stride, hence one UVec4
	// of host storage per logical value.

	const VkDeviceSize bufferSizeBytes = sizeof(tcu::UVec4) * m_numValues;
	const Buffer inputBuffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, inputBufferUsageFlags), MemoryRequirement::HostVisible);

	// Fill the input buffer with data
	{
		de::Random rnd(randomSeed);
		const Allocation& inputBufferAllocation = inputBuffer.getAllocation();
		tcu::UVec4* bufferPtr = static_cast<tcu::UVec4*>(inputBufferAllocation.getHostPtr());
		// Only the .x component of each element is read by the shader.
		for (deUint32 i = 0; i < m_numValues; ++i)
			bufferPtr[i].x() = rnd.getUint32();

		flushAlloc(vk, device, inputBufferAllocation);
	}

	// Create an output buffer

	const Buffer outputBuffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);

	// Create descriptor set
	// Binding 0 = input (UBO or SSBO), binding 1 = output SSBO.

	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
		DescriptorSetLayoutBuilder()
		.addSingleBinding(inputBufferDescriptorType, VK_SHADER_STAGE_COMPUTE_BIT)
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.build(vk, device));

	const Unique<VkDescriptorPool> descriptorPool(
		DescriptorPoolBuilder()
		.addType(inputBufferDescriptorType)
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

	const VkDescriptorBufferInfo inputBufferDescriptorInfo = makeDescriptorBufferInfo(*inputBuffer, 0ull, bufferSizeBytes);
	const VkDescriptorBufferInfo outputBufferDescriptorInfo = makeDescriptorBufferInfo(*outputBuffer, 0ull, bufferSizeBytes);
	DescriptorSetUpdateBuilder()
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), inputBufferDescriptorType, &inputBufferDescriptorInfo)
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &outputBufferDescriptorInfo)
		.update(vk, device);

	// Perform the computation

	const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
	const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
	const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));

	// Host writes -> shader reads (before dispatch).
	const VkBufferMemoryBarrier hostWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *inputBuffer, 0ull, bufferSizeBytes);

	// Shader writes -> host reads (after dispatch).
	const VkBufferMemoryBarrier shaderWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *outputBuffer, 0ull, bufferSizeBytes);

	const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
	const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

	// Start recording commands

	beginCommandBuffer(vk, *cmdBuffer);

	vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
	vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);

	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
	vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &shaderWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

	endCommandBuffer(vk, *cmdBuffer);

	// Wait for completion

	submitCommandsAndWait(vk, device, queue, *cmdBuffer);

	// Validate the results

	const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
	// Make device writes visible to the host before reading.
	invalidateAlloc(vk, device, outputBufferAllocation);

	const tcu::UVec4* bufferPtr = static_cast<tcu::UVec4*>(outputBufferAllocation.getHostPtr());
	const tcu::UVec4* refBufferPtr = static_cast<tcu::UVec4*>(inputBuffer.getAllocation().getHostPtr());

	// Each output element's .x must equal the bitwise inverse of the input's .x.
	for (deUint32 ndx = 0; ndx < m_numValues; ++ndx)
	{
		const deUint32 res = bufferPtr[ndx].x();
		const deUint32 ref = ~refBufferPtr[ndx].x();

		if (res != ref)
		{
			std::ostringstream msg;
			msg << "Comparison failed for Output.values[" << ndx << "]";
			return tcu::TestStatus::fail(msg.str());
		}
	}
	return tcu::TestStatus::pass("Compute succeeded");
}
1292
// Test case where a compute shader bit-inverts every element of a single
// read/write SSBO in place. With sized == false the shader declares the array
// unsized and relies on runtime .length().
class InvertSSBOInPlaceTest : public vkt::TestCase
{
public:
								InvertSSBOInPlaceTest	(tcu::TestContext&	testCtx,
														 const std::string&	name,
														 const std::string&	description,
														 const deUint32		numValues,
														 const bool			sized,
														 const tcu::IVec3&	localSize,
														 const tcu::IVec3&	workSize);


	// Registers the "comp" GLSL compute shader.
	void						initPrograms			(SourceCollections&	sourceCollections) const;
	TestInstance*				createInstance			(Context&			context) const;

private:
	const deUint32				m_numValues;	// total number of uints processed
	const bool					m_sized;		// true = explicit array size in the shader
	const tcu::IVec3			m_localSize;	// workgroup (local) size
	const tcu::IVec3			m_workSize;		// number of workgroups dispatched
};
1314
// Runtime counterpart of InvertSSBOInPlaceTest. Note it does not need the
// "sized" flag: sizing only affects shader generation, not execution.
class InvertSSBOInPlaceTestInstance : public vkt::TestInstance
{
public:
								InvertSSBOInPlaceTestInstance	(Context&			context,
																 const deUint32		numValues,
																 const tcu::IVec3&	localSize,
																 const tcu::IVec3&	workSize);

	// Executes one run of the test and returns pass/fail.
	tcu::TestStatus				iterate							(void);

private:
	const deUint32				m_numValues;	// total number of uints processed
	const tcu::IVec3			m_localSize;	// workgroup (local) size
	const tcu::IVec3			m_workSize;		// number of workgroups dispatched
};
1330
// Constructor: stores the configuration and validates divisibility.
InvertSSBOInPlaceTest::InvertSSBOInPlaceTest (tcu::TestContext&		testCtx,
											  const std::string&	name,
											  const std::string&	description,
											  const deUint32		numValues,
											  const bool			sized,
											  const tcu::IVec3&		localSize,
											  const tcu::IVec3&		workSize)
	: TestCase		(testCtx, name, description)
	, m_numValues	(numValues)
	, m_sized		(sized)
	, m_localSize	(localSize)
	, m_workSize	(workSize)
{
	// The shader splits the array evenly across all invocations, so the total
	// invocation count must divide numValues exactly.
	DE_ASSERT(m_numValues % (multiplyComponents(m_workSize) * multiplyComponents(m_localSize)) == 0);
}
1346
initPrograms(SourceCollections & sourceCollections) const1347 void InvertSSBOInPlaceTest::initPrograms (SourceCollections& sourceCollections) const
1348 {
1349 std::ostringstream src;
1350 src << "#version 310 es\n"
1351 << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
1352 << "layout(binding = 0) buffer InOut {\n"
1353 << " uint values[" << (m_sized ? de::toString(m_numValues) : "") << "];\n"
1354 << "} sb_inout;\n"
1355 << "void main (void) {\n"
1356 << " uvec3 size = gl_NumWorkGroups * gl_WorkGroupSize;\n"
1357 << " uint numValuesPerInv = uint(sb_inout.values.length()) / (size.x*size.y*size.z);\n"
1358 << " uint groupNdx = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
1359 << " uint offset = numValuesPerInv*groupNdx;\n"
1360 << "\n"
1361 << " for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
1362 << " sb_inout.values[offset + ndx] = ~sb_inout.values[offset + ndx];\n"
1363 << "}\n";
1364
1365 sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
1366 }
1367
// Creates the runtime instance; m_sized is not forwarded because it only
// affects shader generation.
TestInstance* InvertSSBOInPlaceTest::createInstance (Context& context) const
{
	return new InvertSSBOInPlaceTestInstance(context, m_numValues, m_localSize, m_workSize);
}
1372
// Instance constructor: simply captures the configuration chosen by the test case.
InvertSSBOInPlaceTestInstance::InvertSSBOInPlaceTestInstance (Context&			context,
															  const deUint32	numValues,
															  const tcu::IVec3&	localSize,
															  const tcu::IVec3&	workSize)
	: TestInstance	(context)
	, m_numValues	(numValues)
	, m_localSize	(localSize)
	, m_workSize	(workSize)
{
}
1383
// Runs the in-place invert test: fills one SSBO with random values (keeping a
// host-side copy, since the buffer itself is overwritten), dispatches the
// shader, and verifies each element now equals the inverse of its original.
tcu::TestStatus InvertSSBOInPlaceTestInstance::iterate (void)
{
	const DeviceInterface&	vk					= m_context.getDeviceInterface();
	const VkDevice			device				= m_context.getDevice();
	const VkQueue			queue				= m_context.getUniversalQueue();
	const deUint32			queueFamilyIndex	= m_context.getUniversalQueueFamilyIndex();
	Allocator&				allocator			= m_context.getDefaultAllocator();

	// Create an input/output buffer

	const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * m_numValues;
	const Buffer buffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);

	// Fill the buffer with data
	// inputData keeps the reference values because the shader overwrites the
	// buffer in place.

	typedef std::vector<deUint32> data_vector_t;
	data_vector_t inputData(m_numValues);

	{
		de::Random rnd(0x82ce7f);
		const Allocation& bufferAllocation = buffer.getAllocation();
		deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
		for (deUint32 i = 0; i < m_numValues; ++i)
			inputData[i] = *bufferPtr++ = rnd.getUint32();

		// Make host writes visible to the device (allocation may be non-coherent).
		flushAlloc(vk, device, bufferAllocation);
	}

	// Create descriptor set
	// A single binding: the SSBO is both input and output.

	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
		DescriptorSetLayoutBuilder()
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.build(vk, device));

	const Unique<VkDescriptorPool> descriptorPool(
		DescriptorPoolBuilder()
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

	const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*buffer, 0ull, bufferSizeBytes);
	DescriptorSetUpdateBuilder()
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
		.update(vk, device);

	// Perform the computation

	const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
	const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
	const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));

	// Host writes -> shader reads (before dispatch).
	const VkBufferMemoryBarrier hostWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *buffer, 0ull, bufferSizeBytes);

	// Shader writes -> host reads (after dispatch); same buffer both times.
	const VkBufferMemoryBarrier shaderWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer, 0ull, bufferSizeBytes);

	const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
	const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

	// Start recording commands

	beginCommandBuffer(vk, *cmdBuffer);

	vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
	vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);

	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
	vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &shaderWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

	endCommandBuffer(vk, *cmdBuffer);

	// Wait for completion

	submitCommandsAndWait(vk, device, queue, *cmdBuffer);

	// Validate the results

	const Allocation& bufferAllocation = buffer.getAllocation();
	// Make device writes visible to the host before reading.
	invalidateAlloc(vk, device, bufferAllocation);

	const deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());

	// Every element must now be the bitwise inverse of the saved original.
	for (deUint32 ndx = 0; ndx < m_numValues; ++ndx)
	{
		const deUint32 res = bufferPtr[ndx];
		const deUint32 ref = ~inputData[ndx];

		if (res != ref)
		{
			std::ostringstream msg;
			msg << "Comparison failed for InOut.values[" << ndx << "]";
			return tcu::TestStatus::fail(msg.str());
		}
	}
	return tcu::TestStatus::pass("Compute succeeded");
}
1482
// Verifies that a single compute dispatch can write to two distinct SSBOs
// (bindings 0 and 1); both buffers are validated on the host afterwards.
class WriteToMultipleSSBOTest : public vkt::TestCase
{
public:
						WriteToMultipleSSBOTest	(tcu::TestContext&		testCtx,
												 const std::string&		name,
												 const std::string&		description,
												 const deUint32			numValues,	// uint element count per output buffer
												 const bool				sized,		// true = explicit array size in GLSL, false = runtime-sized array
												 const tcu::IVec3&		localSize,	// workgroup dimensions (local_size_x/y/z)
												 const tcu::IVec3&		workSize);	// dispatched workgroup counts per dimension

	void				initPrograms			(SourceCollections& sourceCollections) const;
	TestInstance*		createInstance			(Context& context) const;

private:
	const deUint32		m_numValues;
	const bool			m_sized;
	const tcu::IVec3	m_localSize;
	const tcu::IVec3	m_workSize;
};
1503
// Runtime counterpart of WriteToMultipleSSBOTest. Note that m_sized is not
// needed here: it only affects shader source generation in initPrograms().
class WriteToMultipleSSBOTestInstance : public vkt::TestInstance
{
public:
						WriteToMultipleSSBOTestInstance	(Context&			context,
														 const deUint32		numValues,
														 const tcu::IVec3&	localSize,
														 const tcu::IVec3&	workSize);

	tcu::TestStatus		iterate							(void);

private:
	const deUint32		m_numValues;
	const tcu::IVec3	m_localSize;
	const tcu::IVec3	m_workSize;
};
1519
WriteToMultipleSSBOTest::WriteToMultipleSSBOTest (tcu::TestContext&		testCtx,
												  const std::string&	name,
												  const std::string&	description,
												  const deUint32		numValues,
												  const bool			sized,
												  const tcu::IVec3&		localSize,
												  const tcu::IVec3&		workSize)
	: TestCase		(testCtx, name, description)
	, m_numValues	(numValues)
	, m_sized		(sized)
	, m_localSize	(localSize)
	, m_workSize	(workSize)
{
	// Each invocation writes an equal slice of the buffers, so the element
	// count must divide evenly by the total invocation count of the dispatch.
	DE_ASSERT(m_numValues % (multiplyComponents(m_workSize) * multiplyComponents(m_localSize)) == 0);
}
1535
initPrograms(SourceCollections & sourceCollections) const1536 void WriteToMultipleSSBOTest::initPrograms (SourceCollections& sourceCollections) const
1537 {
1538 std::ostringstream src;
1539 src << "#version 310 es\n"
1540 << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
1541 << "layout(binding = 0) writeonly buffer Out0 {\n"
1542 << " uint values[" << (m_sized ? de::toString(m_numValues) : "") << "];\n"
1543 << "} sb_out0;\n"
1544 << "layout(binding = 1) writeonly buffer Out1 {\n"
1545 << " uint values[" << (m_sized ? de::toString(m_numValues) : "") << "];\n"
1546 << "} sb_out1;\n"
1547 << "void main (void) {\n"
1548 << " uvec3 size = gl_NumWorkGroups * gl_WorkGroupSize;\n"
1549 << " uint groupNdx = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
1550 << "\n"
1551 << " {\n"
1552 << " uint numValuesPerInv = uint(sb_out0.values.length()) / (size.x*size.y*size.z);\n"
1553 << " uint offset = numValuesPerInv*groupNdx;\n"
1554 << "\n"
1555 << " for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
1556 << " sb_out0.values[offset + ndx] = offset + ndx;\n"
1557 << " }\n"
1558 << " {\n"
1559 << " uint numValuesPerInv = uint(sb_out1.values.length()) / (size.x*size.y*size.z);\n"
1560 << " uint offset = numValuesPerInv*groupNdx;\n"
1561 << "\n"
1562 << " for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
1563 << " sb_out1.values[offset + ndx] = uint(sb_out1.values.length()) - offset - ndx;\n"
1564 << " }\n"
1565 << "}\n";
1566
1567 sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
1568 }
1569
createInstance(Context & context) const1570 TestInstance* WriteToMultipleSSBOTest::createInstance (Context& context) const
1571 {
1572 return new WriteToMultipleSSBOTestInstance(context, m_numValues, m_localSize, m_workSize);
1573 }
1574
// Simple member-wise initialization; all real work happens in iterate().
WriteToMultipleSSBOTestInstance::WriteToMultipleSSBOTestInstance (Context&			context,
																  const deUint32	numValues,
																  const tcu::IVec3&	localSize,
																  const tcu::IVec3&	workSize)
	: TestInstance	(context)
	, m_numValues	(numValues)
	, m_localSize	(localSize)
	, m_workSize	(workSize)
{
}
1585
// Records and submits a single dispatch that writes both SSBOs, then checks
// both result buffers on the host.
tcu::TestStatus WriteToMultipleSSBOTestInstance::iterate (void)
{
	const DeviceInterface&	vk					= m_context.getDeviceInterface();
	const VkDevice			device				= m_context.getDevice();
	const VkQueue			queue				= m_context.getUniversalQueue();
	const deUint32			queueFamilyIndex	= m_context.getUniversalQueueFamilyIndex();
	Allocator&				allocator			= m_context.getDefaultAllocator();

	// Create two output buffers (host-visible so results can be read back directly)

	const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * m_numValues;
	const Buffer buffer0(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
	const Buffer buffer1(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);

	// Create descriptor set: binding 0 -> buffer0 (sb_out0), binding 1 -> buffer1 (sb_out1)

	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
		DescriptorSetLayoutBuilder()
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.build(vk, device));

	const Unique<VkDescriptorPool> descriptorPool(
		DescriptorPoolBuilder()
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 2u)
		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

	const VkDescriptorBufferInfo buffer0DescriptorInfo = makeDescriptorBufferInfo(*buffer0, 0ull, bufferSizeBytes);
	const VkDescriptorBufferInfo buffer1DescriptorInfo = makeDescriptorBufferInfo(*buffer1, 0ull, bufferSizeBytes);
	DescriptorSetUpdateBuilder()
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &buffer0DescriptorInfo)
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &buffer1DescriptorInfo)
		.update(vk, device);

	// Perform the computation

	const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
	const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
	const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));

	// Make shader writes to both buffers visible to subsequent host reads.
	// No host->shader barrier is needed: the shader only writes, nothing is uploaded.
	const VkBufferMemoryBarrier shaderWriteBarriers[] =
	{
		makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer0, 0ull, bufferSizeBytes),
		makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer1, 0ull, bufferSizeBytes)
	};

	const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
	const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

	// Start recording commands

	beginCommandBuffer(vk, *cmdBuffer);

	vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
	vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);

	vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, DE_LENGTH_OF_ARRAY(shaderWriteBarriers), shaderWriteBarriers, 0, (const VkImageMemoryBarrier*)DE_NULL);

	endCommandBuffer(vk, *cmdBuffer);

	// Wait for completion

	submitCommandsAndWait(vk, device, queue, *cmdBuffer);

	// Validate the results

	// buffer0 must hold the identity pattern: values[i] == i
	{
		const Allocation& buffer0Allocation = buffer0.getAllocation();
		invalidateAlloc(vk, device, buffer0Allocation);
		const deUint32* buffer0Ptr = static_cast<deUint32*>(buffer0Allocation.getHostPtr());

		for (deUint32 ndx = 0; ndx < m_numValues; ++ndx)
		{
			const deUint32 res = buffer0Ptr[ndx];
			const deUint32 ref = ndx;

			if (res != ref)
			{
				std::ostringstream msg;
				msg << "Comparison failed for Out0.values[" << ndx << "] res=" << res << " ref=" << ref;
				return tcu::TestStatus::fail(msg.str());
			}
		}
	}
	// buffer1 must hold the descending pattern: values[i] == numValues - i
	{
		const Allocation& buffer1Allocation = buffer1.getAllocation();
		invalidateAlloc(vk, device, buffer1Allocation);
		const deUint32* buffer1Ptr = static_cast<deUint32*>(buffer1Allocation.getHostPtr());

		for (deUint32 ndx = 0; ndx < m_numValues; ++ndx)
		{
			const deUint32 res = buffer1Ptr[ndx];
			const deUint32 ref = m_numValues - ndx;

			if (res != ref)
			{
				std::ostringstream msg;
				msg << "Comparison failed for Out1.values[" << ndx << "] res=" << res << " ref=" << ref;
				return tcu::TestStatus::fail(msg.str());
			}
		}
	}
	return tcu::TestStatus::pass("Compute succeeded");
}
1692
// Tests an SSBO write->read dependency between two back-to-back compute
// dispatches, synchronized with a compute->compute buffer memory barrier.
class SSBOBarrierTest : public vkt::TestCase
{
public:
						SSBOBarrierTest		(tcu::TestContext&	testCtx,
											 const std::string&	name,
											 const std::string&	description,
											 const tcu::IVec3&	workSize);	// workgroup counts per dimension

	void				initPrograms		(SourceCollections& sourceCollections) const;
	TestInstance*		createInstance		(Context& context) const;

private:
	const tcu::IVec3	m_workSize;
};
1707
// Runtime counterpart of SSBOBarrierTest.
class SSBOBarrierTestInstance : public vkt::TestInstance
{
public:
						SSBOBarrierTestInstance	(Context&			context,
												 const tcu::IVec3&	workSize);

	tcu::TestStatus		iterate					(void);

private:
	const tcu::IVec3	m_workSize;
};
1719
SSBOBarrierTest::SSBOBarrierTest (tcu::TestContext&		testCtx,
								  const std::string&	name,
								  const std::string&	description,
								  const tcu::IVec3&		workSize)
	: TestCase		(testCtx, name, description)
	, m_workSize	(workSize)
{
}
1728
void SSBOBarrierTest::initPrograms (SourceCollections& sourceCollections) const
{
	// First pass ("comp0"): each workgroup writes (u_baseVal + its flat
	// workgroup index) into the work buffer at binding 1.
	sourceCollections.glslSources.add("comp0") << glu::ComputeSource(
		"#version 310 es\n"
		"layout (local_size_x = 1) in;\n"
		"layout(binding = 2) readonly uniform Constants {\n"
		"    uint u_baseVal;\n"
		"};\n"
		"layout(binding = 1) writeonly buffer Output {\n"
		"    uint values[];\n"
		"};\n"
		"void main (void) {\n"
		"    uint offset = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
		"    values[offset] = u_baseVal + offset;\n"
		"}\n");

	// Second pass ("comp1"): reads the value written by the first pass and
	// atomically accumulates it into a single sum at binding 0. Correctness
	// depends on the buffer barrier recorded between the two dispatches.
	sourceCollections.glslSources.add("comp1") << glu::ComputeSource(
		"#version 310 es\n"
		"layout (local_size_x = 1) in;\n"
		"layout(binding = 1) readonly buffer Input {\n"
		"    uint values[];\n"
		"};\n"
		"layout(binding = 0) coherent buffer Output {\n"
		"    uint sum;\n"
		"};\n"
		"void main (void) {\n"
		"    uint offset = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
		"    uint value  = values[offset];\n"
		"    atomicAdd(sum, value);\n"
		"}\n");
}
1760
createInstance(Context & context) const1761 TestInstance* SSBOBarrierTest::createInstance (Context& context) const
1762 {
1763 return new SSBOBarrierTestInstance(context, m_workSize);
1764 }
1765
// Simple member-wise initialization; all real work happens in iterate().
SSBOBarrierTestInstance::SSBOBarrierTestInstance (Context& context, const tcu::IVec3& workSize)
	: TestInstance	(context)
	, m_workSize	(workSize)
{
}
1771
// Records two dispatches (producer then consumer) separated by a
// compute->compute buffer barrier, then verifies the accumulated sum.
tcu::TestStatus SSBOBarrierTestInstance::iterate (void)
{
	const DeviceInterface&	vk					= m_context.getDeviceInterface();
	const VkDevice			device				= m_context.getDevice();
	const VkQueue			queue				= m_context.getUniversalQueue();
	const deUint32			queueFamilyIndex	= m_context.getUniversalQueueFamilyIndex();
	Allocator&				allocator			= m_context.getDefaultAllocator();

	// Create a work buffer used by both shaders.
	// One uint per workgroup; the host never maps it, so any memory type is fine.

	const int workGroupCount = multiplyComponents(m_workSize);
	const VkDeviceSize workBufferSizeBytes = sizeof(deUint32) * workGroupCount;
	const Buffer workBuffer(vk, device, allocator, makeBufferCreateInfo(workBufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::Any);

	// Create an output buffer (holds the single atomic sum)

	const VkDeviceSize outputBufferSizeBytes = sizeof(deUint32);
	const Buffer outputBuffer(vk, device, allocator, makeBufferCreateInfo(outputBufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);

	// Initialize atomic counter value to zero
	{
		const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
		deUint32* outputBufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
		*outputBufferPtr = 0;
		flushAlloc(vk, device, outputBufferAllocation);
	}

	// Create a uniform buffer (to pass uniform constants)

	const VkDeviceSize uniformBufferSizeBytes = sizeof(deUint32);
	const Buffer uniformBuffer(vk, device, allocator, makeBufferCreateInfo(uniformBufferSizeBytes, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT), MemoryRequirement::HostVisible);

	// Set the constants in the uniform buffer

	const deUint32 baseValue = 127;
	{
		const Allocation& uniformBufferAllocation = uniformBuffer.getAllocation();
		deUint32* uniformBufferPtr = static_cast<deUint32*>(uniformBufferAllocation.getHostPtr());
		uniformBufferPtr[0] = baseValue;

		flushAlloc(vk, device, uniformBufferAllocation);
	}

	// Create descriptor set:
	//   binding 0 -> output buffer (sum), binding 1 -> work buffer, binding 2 -> uniform constants

	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
		DescriptorSetLayoutBuilder()
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.addSingleBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.build(vk, device));

	const Unique<VkDescriptorPool> descriptorPool(
		DescriptorPoolBuilder()
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 2u)
		.addType(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)
		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

	const VkDescriptorBufferInfo workBufferDescriptorInfo = makeDescriptorBufferInfo(*workBuffer, 0ull, workBufferSizeBytes);
	const VkDescriptorBufferInfo outputBufferDescriptorInfo = makeDescriptorBufferInfo(*outputBuffer, 0ull, outputBufferSizeBytes);
	const VkDescriptorBufferInfo uniformBufferDescriptorInfo = makeDescriptorBufferInfo(*uniformBuffer, 0ull, uniformBufferSizeBytes);
	DescriptorSetUpdateBuilder()
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &outputBufferDescriptorInfo)
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &workBufferDescriptorInfo)
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(2u), VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, &uniformBufferDescriptorInfo)
		.update(vk, device);

	// Perform the computation
	// Both pipelines share the same layout, so the descriptor set stays bound
	// across the pipeline switch.

	const Unique<VkShaderModule> shaderModule0(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp0"), 0));
	const Unique<VkShaderModule> shaderModule1(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp1"), 0));

	const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
	const Unique<VkPipeline> pipeline0(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule0));
	const Unique<VkPipeline> pipeline1(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule1));

	// host write -> uniform read of the constants
	const VkBufferMemoryBarrier writeUniformConstantsBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_UNIFORM_READ_BIT, *uniformBuffer, 0ull, uniformBufferSizeBytes);

	// first dispatch's writes -> second dispatch's reads of the work buffer (the barrier under test)
	const VkBufferMemoryBarrier betweenShadersBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *workBuffer, 0ull, workBufferSizeBytes);

	// second dispatch's writes -> host read of the sum
	const VkBufferMemoryBarrier afterComputeBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *outputBuffer, 0ull, outputBufferSizeBytes);

	const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
	const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

	// Start recording commands

	beginCommandBuffer(vk, *cmdBuffer);

	vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline0);
	vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);

	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &writeUniformConstantsBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

	vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &betweenShadersBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

	// Switch to the second shader program
	vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline1);

	vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &afterComputeBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

	endCommandBuffer(vk, *cmdBuffer);

	// Wait for completion

	submitCommandsAndWait(vk, device, queue, *cmdBuffer);

	// Validate the results
	// Expected sum: sum over all workgroups of (baseValue + flat group index).

	const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
	invalidateAlloc(vk, device, outputBufferAllocation);

	const deUint32* bufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
	const deUint32 res = *bufferPtr;
	deUint32 ref = 0;

	for (int ndx = 0; ndx < workGroupCount; ++ndx)
		ref += baseValue + ndx;

	if (res != ref)
	{
		std::ostringstream msg;
		msg << "ERROR: comparison failed, expected " << ref << ", got " << res;
		return tcu::TestStatus::fail(msg.str());
	}
	return tcu::TestStatus::pass("Compute succeeded");
}
1903
// Tests imageAtomicAdd on an r32ui storage image: each workgroup accumulates
// localSize input values into its own pixel.
class ImageAtomicOpTest : public vkt::TestCase
{
public:
						ImageAtomicOpTest	(tcu::TestContext&	testCtx,
											 const std::string&	name,
											 const std::string&	description,
											 const deUint32		localSize,	// invocations per workgroup (values summed per pixel)
											 const tcu::IVec2&	imageSize);	// image extent; one workgroup per pixel

	void				initPrograms		(SourceCollections& sourceCollections) const;
	TestInstance*		createInstance		(Context& context) const;

private:
	const deUint32		m_localSize;
	const tcu::IVec2	m_imageSize;
};
1920
// Runtime counterpart of ImageAtomicOpTest.
class ImageAtomicOpTestInstance : public vkt::TestInstance
{
public:
						ImageAtomicOpTestInstance	(Context&			context,
													 const deUint32		localSize,
													 const tcu::IVec2&	imageSize);

	tcu::TestStatus		iterate						(void);

private:
	const deUint32		m_localSize;
	const tcu::IVec2	m_imageSize;
};
1934
ImageAtomicOpTest::ImageAtomicOpTest (tcu::TestContext&		testCtx,
									  const std::string&	name,
									  const std::string&	description,
									  const deUint32		localSize,
									  const tcu::IVec2&		imageSize)
	: TestCase		(testCtx, name, description)
	, m_localSize	(localSize)
	, m_imageSize	(imageSize)
{
}
1945
initPrograms(SourceCollections & sourceCollections) const1946 void ImageAtomicOpTest::initPrograms (SourceCollections& sourceCollections) const
1947 {
1948 std::ostringstream src;
1949 src << "#version 310 es\n"
1950 << "#extension GL_OES_shader_image_atomic : require\n"
1951 << "layout (local_size_x = " << m_localSize << ") in;\n"
1952 << "layout(binding = 1, r32ui) coherent uniform highp uimage2D u_dstImg;\n"
1953 << "layout(binding = 0) readonly buffer Input {\n"
1954 << " uint values[" << (multiplyComponents(m_imageSize) * m_localSize) << "];\n"
1955 << "} sb_in;\n\n"
1956 << "void main (void) {\n"
1957 << " uint stride = gl_NumWorkGroups.x*gl_WorkGroupSize.x;\n"
1958 << " uint value = sb_in.values[gl_GlobalInvocationID.y*stride + gl_GlobalInvocationID.x];\n"
1959 << "\n"
1960 << " if (gl_LocalInvocationIndex == 0u)\n"
1961 << " imageStore(u_dstImg, ivec2(gl_WorkGroupID.xy), uvec4(0));\n"
1962 << " memoryBarrierImage();\n"
1963 << " barrier();\n"
1964 << " imageAtomicAdd(u_dstImg, ivec2(gl_WorkGroupID.xy), value);\n"
1965 << "}\n";
1966
1967 sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
1968 }
1969
createInstance(Context & context) const1970 TestInstance* ImageAtomicOpTest::createInstance (Context& context) const
1971 {
1972 return new ImageAtomicOpTestInstance(context, m_localSize, m_imageSize);
1973 }
1974
// Simple member-wise initialization; all real work happens in iterate().
ImageAtomicOpTestInstance::ImageAtomicOpTestInstance (Context& context, const deUint32 localSize, const tcu::IVec2& imageSize)
	: TestInstance	(context)
	, m_localSize	(localSize)
	, m_imageSize	(imageSize)
{
}
1981
// Uploads random input values, runs the atomic-add shader (one workgroup per
// pixel), copies the image to a buffer and checks each pixel against the sum
// of its workgroup's inputs.
tcu::TestStatus ImageAtomicOpTestInstance::iterate (void)
{
	const DeviceInterface&	vk					= m_context.getDeviceInterface();
	const VkDevice			device				= m_context.getDevice();
	const VkQueue			queue				= m_context.getUniversalQueue();
	const deUint32			queueFamilyIndex	= m_context.getUniversalQueueFamilyIndex();
	Allocator&				allocator			= m_context.getDefaultAllocator();

	// Create an image (TRANSFER_SRC is needed for the readback copy below)

	const VkImageCreateInfo imageParams = make2DImageCreateInfo(m_imageSize, VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_STORAGE_BIT);
	const Image image(vk, device, allocator, imageParams, MemoryRequirement::Any);

	const VkImageSubresourceRange subresourceRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
	const Unique<VkImageView> imageView(makeImageView(vk, device, *image, VK_IMAGE_VIEW_TYPE_2D, VK_FORMAT_R32_UINT, subresourceRange));

	// Input buffer: one value per invocation across the whole dispatch

	const deUint32 numInputValues = multiplyComponents(m_imageSize) * m_localSize;
	const VkDeviceSize inputBufferSizeBytes = sizeof(deUint32) * numInputValues;

	const Buffer inputBuffer(vk, device, allocator, makeBufferCreateInfo(inputBufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);

	// Populate the input buffer with test data (fixed seed for reproducibility)
	{
		de::Random rnd(0x77238ac2);
		const Allocation& inputBufferAllocation = inputBuffer.getAllocation();
		deUint32* bufferPtr = static_cast<deUint32*>(inputBufferAllocation.getHostPtr());
		for (deUint32 i = 0; i < numInputValues; ++i)
			*bufferPtr++ = rnd.getUint32();

		flushAlloc(vk, device, inputBufferAllocation);
	}

	// Create a buffer to store shader output (copied from image data)

	const deUint32 imageArea = multiplyComponents(m_imageSize);
	const VkDeviceSize outputBufferSizeBytes = sizeof(deUint32) * imageArea;
	const Buffer outputBuffer(vk, device, allocator, makeBufferCreateInfo(outputBufferSizeBytes, VK_BUFFER_USAGE_TRANSFER_DST_BIT), MemoryRequirement::HostVisible);

	// Create descriptor set: binding 0 -> input SSBO, binding 1 -> storage image

	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
		DescriptorSetLayoutBuilder()
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_SHADER_STAGE_COMPUTE_BIT)
		.build(vk, device));

	const Unique<VkDescriptorPool> descriptorPool(
		DescriptorPoolBuilder()
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE)
		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

	// Set the bindings

	const VkDescriptorImageInfo imageDescriptorInfo = makeDescriptorImageInfo(DE_NULL, *imageView, VK_IMAGE_LAYOUT_GENERAL);
	const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*inputBuffer, 0ull, inputBufferSizeBytes);

	DescriptorSetUpdateBuilder()
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &imageDescriptorInfo)
		.update(vk, device);

	// Perform the computation
	{
		const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
		const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
		const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));

		// host write -> shader read of the input values
		const VkBufferMemoryBarrier inputBufferPostHostWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *inputBuffer, 0ull, inputBufferSizeBytes);

		// Transition the image to GENERAL for storage-image access; contents
		// are discarded (UNDEFINED) since the shader clears each pixel first.
		const VkImageMemoryBarrier imageLayoutBarrier = makeImageMemoryBarrier(
			(VkAccessFlags)0, VK_ACCESS_SHADER_WRITE_BIT,
			VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL,
			*image, subresourceRange);

		// Prepare the command buffer

		const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
		const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

		// Start recording commands

		beginCommandBuffer(vk, *cmdBuffer);

		vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
		vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);

		vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &inputBufferPostHostWriteBarrier, 1, &imageLayoutBarrier);
		// One workgroup per pixel
		vk.cmdDispatch(*cmdBuffer, m_imageSize.x(), m_imageSize.y(), 1u);

		// copyImageToBuffer records the compute->transfer and transfer->host barriers itself
		copyImageToBuffer(vk, *cmdBuffer, *image, *outputBuffer, m_imageSize, VK_ACCESS_SHADER_WRITE_BIT, VK_IMAGE_LAYOUT_GENERAL);

		endCommandBuffer(vk, *cmdBuffer);

		// Wait for completion

		submitCommandsAndWait(vk, device, queue, *cmdBuffer);
	}

	// Validate the results
	// Each pixel must equal the sum of the m_localSize input values assigned
	// to its workgroup (the reference is recomputed from the host-side input).

	const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
	invalidateAlloc(vk, device, outputBufferAllocation);

	const deUint32* bufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
	const deUint32* refBufferPtr = static_cast<deUint32*>(inputBuffer.getAllocation().getHostPtr());

	for (deUint32 pixelNdx = 0; pixelNdx < imageArea; ++pixelNdx)
	{
		const deUint32 res = bufferPtr[pixelNdx];
		deUint32 ref = 0;

		for (deUint32 offs = 0; offs < m_localSize; ++offs)
			ref += refBufferPtr[pixelNdx * m_localSize + offs];

		if (res != ref)
		{
			std::ostringstream msg;
			msg << "Comparison failed for pixel " << pixelNdx;
			return tcu::TestStatus::fail(msg.str());
		}
	}
	return tcu::TestStatus::pass("Compute succeeded");
}
2110
// Tests an image write->read dependency between two compute dispatches
// (first writes pixels, second accumulates them into an SSBO sum).
class ImageBarrierTest : public vkt::TestCase
{
public:
						ImageBarrierTest	(tcu::TestContext&	testCtx,
											 const std::string&	name,
											 const std::string&	description,
											 const tcu::IVec2&	imageSize);	// image extent; one workgroup per pixel

	void				initPrograms		(SourceCollections& sourceCollections) const;
	TestInstance*		createInstance		(Context& context) const;

private:
	const tcu::IVec2	m_imageSize;
};
2125
// Runtime counterpart of ImageBarrierTest.
class ImageBarrierTestInstance : public vkt::TestInstance
{
public:
						ImageBarrierTestInstance	(Context&			context,
													 const tcu::IVec2&	imageSize);

	tcu::TestStatus		iterate						(void);

private:
	const tcu::IVec2	m_imageSize;
};
2137
ImageBarrierTest::ImageBarrierTest (tcu::TestContext&	testCtx,
									const std::string&	name,
									const std::string&	description,
									const tcu::IVec2&	imageSize)
	: TestCase		(testCtx, name, description)
	, m_imageSize	(imageSize)
{
}
2146
void ImageBarrierTest::initPrograms (SourceCollections& sourceCollections) const
{
	// First pass ("comp0"): each workgroup stores (u_baseVal + its flat
	// workgroup index) into its own pixel of the r32ui image at binding 1.
	sourceCollections.glslSources.add("comp0") << glu::ComputeSource(
		"#version 310 es\n"
		"layout (local_size_x = 1) in;\n"
		"layout(binding = 2) readonly uniform Constants {\n"
		"    uint u_baseVal;\n"
		"};\n"
		"layout(binding = 1, r32ui) writeonly uniform highp uimage2D u_img;\n"
		"void main (void) {\n"
		"    uint offset = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
		"    imageStore(u_img, ivec2(gl_WorkGroupID.xy), uvec4(offset + u_baseVal, 0, 0, 0));\n"
		"}\n");

	// Second pass ("comp1"): reads each pixel and atomically accumulates it
	// into a single sum at binding 0. Correctness depends on the image
	// barrier recorded between the two dispatches.
	sourceCollections.glslSources.add("comp1") << glu::ComputeSource(
		"#version 310 es\n"
		"layout (local_size_x = 1) in;\n"
		"layout(binding = 1, r32ui) readonly uniform highp uimage2D u_img;\n"
		"layout(binding = 0) coherent buffer Output {\n"
		"    uint sum;\n"
		"};\n"
		"void main (void) {\n"
		"    uint value = imageLoad(u_img, ivec2(gl_WorkGroupID.xy)).x;\n"
		"    atomicAdd(sum, value);\n"
		"}\n");
}
2173
createInstance(Context & context) const2174 TestInstance* ImageBarrierTest::createInstance (Context& context) const
2175 {
2176 return new ImageBarrierTestInstance(context, m_imageSize);
2177 }
2178
// Simple member-wise initialization; all real work happens in iterate().
ImageBarrierTestInstance::ImageBarrierTestInstance (Context& context, const tcu::IVec2& imageSize)
	: TestInstance	(context)
	, m_imageSize	(imageSize)
{
}
2184
iterate(void)2185 tcu::TestStatus ImageBarrierTestInstance::iterate (void)
2186 {
2187 const DeviceInterface& vk = m_context.getDeviceInterface();
2188 const VkDevice device = m_context.getDevice();
2189 const VkQueue queue = m_context.getUniversalQueue();
2190 const deUint32 queueFamilyIndex = m_context.getUniversalQueueFamilyIndex();
2191 Allocator& allocator = m_context.getDefaultAllocator();
2192
2193 // Create an image used by both shaders
2194
2195 const VkImageCreateInfo imageParams = make2DImageCreateInfo(m_imageSize, VK_IMAGE_USAGE_STORAGE_BIT);
2196 const Image image(vk, device, allocator, imageParams, MemoryRequirement::Any);
2197
2198 const VkImageSubresourceRange subresourceRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
2199 const Unique<VkImageView> imageView(makeImageView(vk, device, *image, VK_IMAGE_VIEW_TYPE_2D, VK_FORMAT_R32_UINT, subresourceRange));
2200
2201 // Create an output buffer
2202
2203 const VkDeviceSize outputBufferSizeBytes = sizeof(deUint32);
2204 const Buffer outputBuffer(vk, device, allocator, makeBufferCreateInfo(outputBufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
2205
2206 // Initialize atomic counter value to zero
2207 {
2208 const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
2209 deUint32* outputBufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
2210 *outputBufferPtr = 0;
2211 flushAlloc(vk, device, outputBufferAllocation);
2212 }
2213
2214 // Create a uniform buffer (to pass uniform constants)
2215
2216 const VkDeviceSize uniformBufferSizeBytes = sizeof(deUint32);
2217 const Buffer uniformBuffer(vk, device, allocator, makeBufferCreateInfo(uniformBufferSizeBytes, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT), MemoryRequirement::HostVisible);
2218
2219 // Set the constants in the uniform buffer
2220
2221 const deUint32 baseValue = 127;
2222 {
2223 const Allocation& uniformBufferAllocation = uniformBuffer.getAllocation();
2224 deUint32* uniformBufferPtr = static_cast<deUint32*>(uniformBufferAllocation.getHostPtr());
2225 uniformBufferPtr[0] = baseValue;
2226
2227 flushAlloc(vk, device, uniformBufferAllocation);
2228 }
2229
2230 // Create descriptor set
2231
2232 const Unique<VkDescriptorSetLayout> descriptorSetLayout(
2233 DescriptorSetLayoutBuilder()
2234 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
2235 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_SHADER_STAGE_COMPUTE_BIT)
2236 .addSingleBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
2237 .build(vk, device));
2238
2239 const Unique<VkDescriptorPool> descriptorPool(
2240 DescriptorPoolBuilder()
2241 .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
2242 .addType(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE)
2243 .addType(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)
2244 .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
2245
2246 const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
2247
2248 const VkDescriptorImageInfo imageDescriptorInfo = makeDescriptorImageInfo(DE_NULL, *imageView, VK_IMAGE_LAYOUT_GENERAL);
2249 const VkDescriptorBufferInfo outputBufferDescriptorInfo = makeDescriptorBufferInfo(*outputBuffer, 0ull, outputBufferSizeBytes);
2250 const VkDescriptorBufferInfo uniformBufferDescriptorInfo = makeDescriptorBufferInfo(*uniformBuffer, 0ull, uniformBufferSizeBytes);
2251 DescriptorSetUpdateBuilder()
2252 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &outputBufferDescriptorInfo)
2253 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &imageDescriptorInfo)
2254 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(2u), VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, &uniformBufferDescriptorInfo)
2255 .update(vk, device);
2256
2257 // Perform the computation
2258
2259 const Unique<VkShaderModule> shaderModule0(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp0"), 0));
2260 const Unique<VkShaderModule> shaderModule1(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp1"), 0));
2261
2262 const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
2263 const Unique<VkPipeline> pipeline0(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule0));
2264 const Unique<VkPipeline> pipeline1(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule1));
2265
2266 const VkBufferMemoryBarrier writeUniformConstantsBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_UNIFORM_READ_BIT, *uniformBuffer, 0ull, uniformBufferSizeBytes);
2267
2268 const VkImageMemoryBarrier imageLayoutBarrier = makeImageMemoryBarrier(
2269 0u, 0u,
2270 VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL,
2271 *image, subresourceRange);
2272
2273 const VkImageMemoryBarrier imageBarrierBetweenShaders = makeImageMemoryBarrier(
2274 VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT,
2275 VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
2276 *image, subresourceRange);
2277
2278 const VkBufferMemoryBarrier afterComputeBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *outputBuffer, 0ull, outputBufferSizeBytes);
2279
2280 const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
2281 const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
2282
2283 // Start recording commands
2284
2285 beginCommandBuffer(vk, *cmdBuffer);
2286
2287 vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline0);
2288 vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
2289
2290 vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &writeUniformConstantsBarrier, 1, &imageLayoutBarrier);
2291
2292 vk.cmdDispatch(*cmdBuffer, m_imageSize.x(), m_imageSize.y(), 1u);
2293 vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 0, (const VkBufferMemoryBarrier*)DE_NULL, 1, &imageBarrierBetweenShaders);
2294
2295 // Switch to the second shader program
2296 vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline1);
2297
2298 vk.cmdDispatch(*cmdBuffer, m_imageSize.x(), m_imageSize.y(), 1u);
2299 vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &afterComputeBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
2300
2301 endCommandBuffer(vk, *cmdBuffer);
2302
2303 // Wait for completion
2304
2305 submitCommandsAndWait(vk, device, queue, *cmdBuffer);
2306
2307 // Validate the results
2308
2309 const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
2310 invalidateAlloc(vk, device, outputBufferAllocation);
2311
2312 const int numValues = multiplyComponents(m_imageSize);
2313 const deUint32* bufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
2314 const deUint32 res = *bufferPtr;
2315 deUint32 ref = 0;
2316
2317 for (int ndx = 0; ndx < numValues; ++ndx)
2318 ref += baseValue + ndx;
2319
2320 if (res != ref)
2321 {
2322 std::ostringstream msg;
2323 msg << "ERROR: comparison failed, expected " << ref << ", got " << res;
2324 return tcu::TestStatus::fail(msg.str());
2325 }
2326 return tcu::TestStatus::pass("Compute succeeded");
2327 }
2328
// Base instance for tests that run on a custom device-group device rather than
// the default context device. The constructor immediately builds the custom
// instance, logical device and driver via createDeviceGroup() (defined out of line).
class ComputeTestInstance : public vkt::TestInstance
{
public:
	ComputeTestInstance (Context& context)
		: TestInstance			(context)
		, m_numPhysDevices		(1)
		, m_queueFamilyIndex	(0)
	{
		createDeviceGroup();
	}

	void							createDeviceGroup	(void);
	const vk::DeviceInterface&		getDeviceInterface	(void)			{ return *m_deviceDriver; }
	vk::VkInstance					getInstance			(void)			{ return m_deviceGroupInstance; }
	vk::VkDevice					getDevice			(void)			{ return *m_logicalDevice; }
	vk::VkPhysicalDevice			getPhysicalDevice	(deUint32 i = 0){ return m_physicalDevices[i]; }

protected:
	deUint32						m_numPhysDevices;		// physical devices in the selected group
	deUint32						m_queueFamilyIndex;		// compute-capable queue family on the group device

private:
	CustomInstance					m_deviceGroupInstance;	// instance with VK_KHR_device_group_creation
	vk::Move<vk::VkDevice>				m_logicalDevice;
	std::vector<vk::VkPhysicalDevice>		m_physicalDevices;
	de::MovePtr<vk::DeviceDriver>			m_deviceDriver;
};
2356
// Creates a custom instance with VK_KHR_device_group_creation, picks the device
// group / physical device selected on the command line, and builds a logical
// device spanning the whole group with one compute queue.
void ComputeTestInstance::createDeviceGroup (void)
{
	const tcu::CommandLine&							cmdLine					= m_context.getTestContext().getCommandLine();
	// Command-line ids are 1-based; convert to 0-based indices.
	const deUint32									devGroupIdx				= cmdLine.getVKDeviceGroupId() - 1;
	const deUint32									physDeviceIdx			= cmdLine.getVKDeviceId() - 1;
	const float										queuePriority			= 1.0f;
	const std::vector<std::string>					requiredExtensions		(1, "VK_KHR_device_group_creation");
	m_deviceGroupInstance	= createCustomInstanceWithExtensions(m_context, requiredExtensions);
	std::vector<VkPhysicalDeviceGroupProperties>	devGroupProperties		= enumeratePhysicalDeviceGroups(m_context.getInstanceInterface(), m_deviceGroupInstance);
	m_numPhysDevices		= devGroupProperties[devGroupIdx].physicalDeviceCount;
	std::vector<const char*>						deviceExtensions;

	// VK_KHR_device_group is core from Vulkan 1.1; only request it explicitly on 1.0.
	if (!isCoreDeviceExtension(m_context.getUsedApiVersion(), "VK_KHR_device_group"))
		deviceExtensions.push_back("VK_KHR_device_group");

	VkDeviceGroupDeviceCreateInfo deviceGroupInfo =
	{
		VK_STRUCTURE_TYPE_DEVICE_GROUP_DEVICE_CREATE_INFO_KHR,	//stype
		DE_NULL,												//pNext
		devGroupProperties[devGroupIdx].physicalDeviceCount,	//physicalDeviceCount
		devGroupProperties[devGroupIdx].physicalDevices			//physicalDevices
	};
	const InstanceDriver&			instance		(m_deviceGroupInstance.getDriver());
	const VkPhysicalDeviceFeatures	deviceFeatures	= getPhysicalDeviceFeatures(instance, deviceGroupInfo.pPhysicalDevices[physDeviceIdx]);
	const std::vector<VkQueueFamilyProperties>	queueProps = getPhysicalDeviceQueueFamilyProperties(instance, devGroupProperties[devGroupIdx].physicalDevices[physDeviceIdx]);

	m_physicalDevices.resize(m_numPhysDevices);
	for (deUint32 physDevIdx = 0; physDevIdx < m_numPhysDevices; physDevIdx++)
		m_physicalDevices[physDevIdx] = devGroupProperties[devGroupIdx].physicalDevices[physDevIdx];

	// No early break: if several families support compute, the LAST one wins.
	// NOTE(review): presumably any compute-capable family is acceptable here.
	for (size_t queueNdx = 0; queueNdx < queueProps.size(); queueNdx++)
	{
		if (queueProps[queueNdx].queueFlags & VK_QUEUE_COMPUTE_BIT)
			m_queueFamilyIndex = (deUint32)queueNdx;
	}

	VkDeviceQueueCreateInfo queueInfo =
	{
		VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,		// VkStructureType sType;
		DE_NULL,										// const void* pNext;
		(VkDeviceQueueCreateFlags)0u,					// VkDeviceQueueCreateFlags flags;
		m_queueFamilyIndex,								// deUint32 queueFamilyIndex;
		1u,												// deUint32 queueCount;
		&queuePriority									// const float* pQueuePriorities;
	};

	const VkDeviceCreateInfo deviceInfo =
	{
		VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,							// VkStructureType sType;
		&deviceGroupInfo,												// const void* pNext;
		(VkDeviceCreateFlags)0,											// VkDeviceCreateFlags flags;
		1u ,															// uint32_t queueCreateInfoCount;
		&queueInfo,														// const VkDeviceQueueCreateInfo* pQueueCreateInfos;
		0u,																// uint32_t enabledLayerCount;
		DE_NULL,														// const char* const* ppEnabledLayerNames;
		deUint32(deviceExtensions.size()),								// uint32_t enabledExtensionCount;
		(deviceExtensions.empty() ? DE_NULL : &deviceExtensions[0]),	// const char* const* ppEnabledExtensionNames;
		&deviceFeatures,												// const VkPhysicalDeviceFeatures* pEnabledFeatures;
	};

	m_logicalDevice	= createCustomDevice(m_context.getTestContext().getCommandLine().isValidationEnabled(), m_context.getPlatformInterface(), m_deviceGroupInstance, instance, deviceGroupInfo.pPhysicalDevices[physDeviceIdx], &deviceInfo);
	m_deviceDriver	= de::MovePtr<DeviceDriver>(new DeviceDriver(m_context.getPlatformInterface(), m_deviceGroupInstance, *m_logicalDevice));
}
2420
// Verifies vkCmdDispatchBase(): a full compute grid is split into several
// partial dispatches with non-zero base workgroup offsets, distributed across
// the physical devices of a device group.
class DispatchBaseTest : public vkt::TestCase
{
public:
						DispatchBaseTest	(tcu::TestContext&	testCtx,
											const std::string&	name,
											const std::string&	description,
											const deUint32		numValues,
											const tcu::IVec3&	localsize,
											const tcu::IVec3&	worksize,
											const tcu::IVec3&	splitsize);

	void				initPrograms		(SourceCollections& sourceCollections) const;
	TestInstance*		createInstance		(Context& context) const;

private:
	const deUint32		m_numValues;	// number of deUint32 elements in the SSBO
	const tcu::IVec3	m_localSize;	// workgroup (local) size
	const tcu::IVec3	m_workSize;		// full grid size, in workgroups
	const tcu::IVec3	m_splitSize;	// per-physical-device slice of the grid
};
2441
// Instance side of DispatchBaseTest; runs on the device-group device created
// by ComputeTestInstance.
class DispatchBaseTestInstance : public ComputeTestInstance
{
public:
						DispatchBaseTestInstance	(Context&			context,
													const deUint32		numValues,
													const tcu::IVec3&	localsize,
													const tcu::IVec3&	worksize,
													const tcu::IVec3&	splitsize);

	// True iff "big" is component-wise >= "small" and divisible by it.
	bool				isInputVectorValid			(const tcu::IVec3& small, const tcu::IVec3& big);
	tcu::TestStatus		iterate						(void);

private:
	const deUint32		m_numValues;		// number of deUint32 elements in the SSBO
	const tcu::IVec3	m_localSize;		// workgroup (local) size
	const tcu::IVec3	m_workSize;			// full grid size, in workgroups
	const tcu::IVec3	m_splitWorkSize;	// grid slice dispatched per physical device
};
2460
// Stores the grid configuration; validation of the sizes happens in the
// instance constructor, where the physical device count is known.
DispatchBaseTest::DispatchBaseTest (tcu::TestContext&	testCtx,
									const std::string&	name,
									const std::string&	description,
									const deUint32		numValues,
									const tcu::IVec3&	localsize,
									const tcu::IVec3&	worksize,
									const tcu::IVec3&	splitsize)
	: TestCase		(testCtx, name, description)
	, m_numValues	(numValues)
	, m_localSize	(localsize)
	, m_workSize	(worksize)
	, m_splitSize	(splitsize)
{
}
2475
initPrograms(SourceCollections & sourceCollections) const2476 void DispatchBaseTest::initPrograms (SourceCollections& sourceCollections) const
2477 {
2478 std::ostringstream src;
2479 src << "#version 310 es\n"
2480 << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
2481
2482 << "layout(binding = 0) buffer InOut {\n"
2483 << " uint values[" << de::toString(m_numValues) << "];\n"
2484 << "} sb_inout;\n"
2485
2486 << "layout(binding = 1) readonly uniform uniformInput {\n"
2487 << " uvec3 gridSize;\n"
2488 << "} ubo_in;\n"
2489
2490 << "void main (void) {\n"
2491 << " uvec3 size = ubo_in.gridSize * gl_WorkGroupSize;\n"
2492 << " uint numValuesPerInv = uint(sb_inout.values.length()) / (size.x*size.y*size.z);\n"
2493 << " uint index = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
2494 << " uint offset = numValuesPerInv*index;\n"
2495 << " for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
2496 << " sb_inout.values[offset + ndx] = ~sb_inout.values[offset + ndx];\n"
2497 << "}\n";
2498
2499 sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
2500 }
2501
createInstance(Context & context) const2502 TestInstance* DispatchBaseTest::createInstance (Context& context) const
2503 {
2504 return new DispatchBaseTestInstance(context, m_numValues, m_localSize, m_workSize, m_splitSize);
2505 }
2506
// Validates the grid configuration against the constraints the splitting logic
// in iterate() relies on; throws TestError on any violation.
DispatchBaseTestInstance::DispatchBaseTestInstance (Context& context,
													const deUint32		numValues,
													const tcu::IVec3&	localsize,
													const tcu::IVec3&	worksize,
													const tcu::IVec3&	splitsize)

	: ComputeTestInstance	(context)
	, m_numValues			(numValues)
	, m_localSize			(localsize)
	, m_workSize			(worksize)
	, m_splitWorkSize		(splitsize)
{
	// For easy work distribution across physical devices:
	// WorkSize should be a multiple of SplitWorkSize only in the X component
	// (each device gets a slice of the grid along X; Y and Z must match exactly).
	if ((!isInputVectorValid(m_splitWorkSize, m_workSize)) ||
		(m_workSize.x() <= m_splitWorkSize.x()) ||
		(m_workSize.y() != m_splitWorkSize.y()) ||
		(m_workSize.z() != m_splitWorkSize.z()))
		TCU_THROW(TestError, "Invalid Input.");

	// For easy work distribution within the same physical device:
	// SplitWorkSize should be a multiple of localSize in Y or Z component
	// (the per-device slice is further chopped along Y and Z; X must match exactly).
	if ((!isInputVectorValid(m_localSize, m_splitWorkSize)) ||
		(m_localSize.x() != m_splitWorkSize.x()) ||
		(m_localSize.y() >= m_splitWorkSize.y()) ||
		(m_localSize.z() >= m_splitWorkSize.z()))
		TCU_THROW(TestError, "Invalid Input.");

	// There must be at least one full slice per physical device in the group.
	if ((multiplyComponents(m_workSize) / multiplyComponents(m_splitWorkSize)) < (deInt32) m_numPhysDevices)
		TCU_THROW(TestError, "Not enough work to distribute across all physical devices.");

	// totalWork = total invocation count; the SSBO must hold a whole number of
	// values per invocation.
	deUint32 totalWork = multiplyComponents(m_workSize) * multiplyComponents(m_localSize);
	if ((totalWork > numValues) || (numValues % totalWork != 0))
		TCU_THROW(TestError, "Buffer too small/not aligned to cover all values.");
}
2542
isInputVectorValid(const tcu::IVec3 & small,const tcu::IVec3 & big)2543 bool DispatchBaseTestInstance::isInputVectorValid(const tcu::IVec3& small, const tcu::IVec3& big)
2544 {
2545 if (((big.x() < small.x()) || (big.y() < small.y()) || (big.z() < small.z())) ||
2546 ((big.x() % small.x() != 0) || (big.y() % small.y() != 0) || (big.z() % small.z() != 0)))
2547 return false;
2548 return true;
2549 }
2550
// Splits one logical compute grid into multiple vkCmdDispatchBase() calls —
// one X-slice per physical device, each slice further chopped along Y/Z —
// then verifies every SSBO value was bit-inverted exactly once.
tcu::TestStatus DispatchBaseTestInstance::iterate (void)
{
	const DeviceInterface&	vk					= getDeviceInterface();
	const VkDevice			device				= getDevice();
	const VkQueue			queue				= getDeviceQueue(vk, device, m_queueFamilyIndex, 0);
	SimpleAllocator			allocator			(vk, device, getPhysicalDeviceMemoryProperties(m_context.getInstanceInterface(), getPhysicalDevice()));
	deUint32				totalWorkloadSize	= 0;	// workgroups actually dispatched; checked against m_workSize below

	// Create an uniform and input/output buffer
	const deUint32 uniformBufSize = 3; // Pass the compute grid size
	const VkDeviceSize uniformBufferSizeBytes = sizeof(deUint32) * uniformBufSize;
	const Buffer uniformBuffer(vk, device, allocator, makeBufferCreateInfo(uniformBufferSizeBytes, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT), MemoryRequirement::HostVisible);

	const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * m_numValues;
	const Buffer buffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);

	// Fill the buffers with data
	typedef std::vector<deUint32> data_vector_t;
	data_vector_t uniformInputData(uniformBufSize);
	data_vector_t inputData(m_numValues);

	// The shader reads the FULL grid size from this UBO; it cannot use
	// gl_NumWorkGroups because each dispatch below covers only a sub-grid.
	{
		const Allocation& bufferAllocation = uniformBuffer.getAllocation();
		deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
		uniformInputData[0] = *bufferPtr++ = m_workSize.x();
		uniformInputData[1] = *bufferPtr++ = m_workSize.y();
		uniformInputData[2] = *bufferPtr++ = m_workSize.z();
		flushAlloc(vk, device, bufferAllocation);
	}

	// Random input values; a host-side copy is kept for result verification.
	{
		de::Random rnd(0x82ce7f);
		const Allocation& bufferAllocation = buffer.getAllocation();
		deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
		for (deUint32 i = 0; i < m_numValues; ++i)
			inputData[i] = *bufferPtr++ = rnd.getUint32();

		flushAlloc(vk, device, bufferAllocation);
	}

	// Create descriptor set
	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
		DescriptorSetLayoutBuilder()
		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.addSingleBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
		.build(vk, device));

	const Unique<VkDescriptorPool> descriptorPool(
		DescriptorPoolBuilder()
		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
		.addType(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)
		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

	const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*buffer, 0ull, bufferSizeBytes);
	const VkDescriptorBufferInfo uniformBufferDescriptorInfo = makeDescriptorBufferInfo(*uniformBuffer, 0ull, uniformBufferSizeBytes);

	DescriptorSetUpdateBuilder()
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, &uniformBufferDescriptorInfo)
		.update(vk, device);

	// VK_PIPELINE_CREATE_DISPATCH_BASE is required for vkCmdDispatchBase().
	const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
	const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
	const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, static_cast<VkPipelineCreateFlags>(VK_PIPELINE_CREATE_DISPATCH_BASE), *shaderModule, static_cast<VkPipelineShaderStageCreateFlags>(0u)));

	// Host writes -> shader reads (SSBO and UBO), and shader writes -> host read-back.
	const VkBufferMemoryBarrier hostWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *buffer, 0ull, bufferSizeBytes);
	const VkBufferMemoryBarrier hostUniformWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_UNIFORM_READ_BIT, *uniformBuffer, 0ull, uniformBufferSizeBytes);

	const VkBufferMemoryBarrier shaderWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer, 0ull, bufferSizeBytes);

	const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, m_queueFamilyIndex));
	const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

	// Start recording commands
	beginCommandBuffer(vk, *cmdBuffer);

	vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
	vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);

	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostUniformWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

	// Split the workload across all physical devices based on m_splitWorkSize.x()
	for (deUint32 physDevIdx = 0; physDevIdx < m_numPhysDevices; physDevIdx++)
	{
		deUint32 baseGroupX = physDevIdx * m_splitWorkSize.x();
		deUint32 baseGroupY = 0;
		deUint32 baseGroupZ = 0;

		// Split the workload within the physical device based on m_localSize.y() and m_localSize.z()
		for (deInt32 localIdxY = 0; localIdxY < (m_splitWorkSize.y() / m_localSize.y()); localIdxY++)
		{
			for (deInt32 localIdxZ = 0; localIdxZ < (m_splitWorkSize.z() / m_localSize.z()); localIdxZ++)
			{
				deUint32 offsetX = baseGroupX;
				deUint32 offsetY = baseGroupY + localIdxY * m_localSize.y();
				deUint32 offsetZ = baseGroupZ + localIdxZ * m_localSize.z();

				// Last device picks up the remainder of the grid along X.
				// (Constructor guarantees m_localSize.x() == m_splitWorkSize.x().)
				deUint32 localSizeX = (physDevIdx == (m_numPhysDevices - 1)) ? m_workSize.x() - baseGroupX : m_localSize.x();
				deUint32 localSizeY = m_localSize.y();
				deUint32 localSizeZ = m_localSize.z();

				totalWorkloadSize += (localSizeX * localSizeY * localSizeZ);
				vk.cmdDispatchBase(*cmdBuffer, offsetX, offsetY, offsetZ, localSizeX, localSizeY, localSizeZ);
			}
		}
	}

	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &shaderWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);

	endCommandBuffer(vk, *cmdBuffer);
	submitCommandsAndWait(vk, device, queue, *cmdBuffer);

	// Sanity check: the partial dispatches must tile the full grid exactly.
	if (totalWorkloadSize != deUint32(multiplyComponents(m_workSize)))
		TCU_THROW(TestError, "Not covering the entire workload.");

	// Validate the results: every value must equal the bitwise NOT of its input.
	const Allocation& bufferAllocation = buffer.getAllocation();
	invalidateAlloc(vk, device, bufferAllocation);
	const deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());

	for (deUint32 ndx = 0; ndx < m_numValues; ++ndx)
	{
		const deUint32 res = bufferPtr[ndx];
		const deUint32 ref = ~inputData[ndx];

		if (res != ref)
		{
			std::ostringstream msg;
			msg << "Comparison failed for InOut.values[" << ndx << "]";
			return tcu::TestStatus::fail(msg.str());
		}
	}
	return tcu::TestStatus::pass("Compute succeeded");
}
2689
// Verifies gl_DeviceIndex (GL_EXT_device_group): the shader result depends on
// which physical device in the group executed the dispatch.
class DeviceIndexTest : public vkt::TestCase
{
public:
						DeviceIndexTest		(tcu::TestContext&	testCtx,
											const std::string&	name,
											const std::string&	description,
											const deUint32		numValues,
											const tcu::IVec3&	localsize,
											const tcu::IVec3&	splitsize);

	void				initPrograms		(SourceCollections& sourceCollections) const;
	TestInstance*		createInstance		(Context& context) const;

private:
	const deUint32		m_numValues;	// number of deUint32 elements in the SSBO
	const tcu::IVec3	m_localSize;	// workgroup (local) size
	const tcu::IVec3	m_workSize;		// grid size, in workgroups (set from the ctor's third argument)
	// NOTE(review): m_splitSize is never initialized or read in the visible
	// code, and the out-of-line ctor names its third parameter "worksize"
	// while this declaration says "splitsize" — looks vestigial; confirm.
	const tcu::IVec3	m_splitSize;
};
2709
// Instance side of DeviceIndexTest; runs on the device-group device created
// by ComputeTestInstance.
class DeviceIndexTestInstance : public ComputeTestInstance
{
public:
						DeviceIndexTestInstance	(Context&			context,
												const deUint32		numValues,
												const tcu::IVec3&	localsize,
												const tcu::IVec3&	worksize);
	tcu::TestStatus		iterate					(void);
private:
	const deUint32		m_numValues;	// number of deUint32 elements in the SSBO
	const tcu::IVec3	m_localSize;	// workgroup (local) size
	tcu::IVec3			m_workSize;		// grid size, in workgroups (non-const: may be adjusted in iterate())
};
2723
// Stores the grid configuration; m_splitSize is intentionally left
// default-constructed (it is not used by the visible code).
DeviceIndexTest::DeviceIndexTest (tcu::TestContext&		testCtx,
									const std::string&	name,
									const std::string&	description,
									const deUint32		numValues,
									const tcu::IVec3&	localsize,
									const tcu::IVec3&	worksize)
	: TestCase		(testCtx, name, description)
	, m_numValues	(numValues)
	, m_localSize	(localsize)
	, m_workSize	(worksize)
{
}
2736
initPrograms(SourceCollections & sourceCollections) const2737 void DeviceIndexTest::initPrograms (SourceCollections& sourceCollections) const
2738 {
2739 std::ostringstream src;
2740 src << "#version 310 es\n"
2741 << "#extension GL_EXT_device_group : require\n"
2742 << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
2743
2744 << "layout(binding = 0) buffer InOut {\n"
2745 << " uint values[" << de::toString(m_numValues) << "];\n"
2746 << "} sb_inout;\n"
2747
2748 << "layout(binding = 1) readonly uniform uniformInput {\n"
2749 << " uint baseOffset[1+" << VK_MAX_DEVICE_GROUP_SIZE_KHR << "];\n"
2750 << "} ubo_in;\n"
2751
2752 << "void main (void) {\n"
2753 << " uvec3 size = gl_NumWorkGroups * gl_WorkGroupSize;\n"
2754 << " uint numValuesPerInv = uint(sb_inout.values.length()) / (size.x*size.y*size.z);\n"
2755 << " uint index = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
2756 << " uint offset = numValuesPerInv*index;\n"
2757 << " for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
2758 << " sb_inout.values[offset + ndx] = ubo_in.baseOffset[0] + ubo_in.baseOffset[gl_DeviceIndex + 1];\n"
2759 << "}\n";
2760
2761 sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
2762 }
2763
createInstance(Context & context) const2764 TestInstance* DeviceIndexTest::createInstance (Context& context) const
2765 {
2766 return new DeviceIndexTestInstance(context, m_numValues, m_localSize, m_workSize);
2767 }
2768
// Instance only records the configuration; all device work happens in iterate().
DeviceIndexTestInstance::DeviceIndexTestInstance (Context& context,
													const deUint32		numValues,
													const tcu::IVec3&	localsize,
													const tcu::IVec3&	worksize)

	: ComputeTestInstance	(context)
	, m_numValues			(numValues)
	, m_localSize			(localsize)
	, m_workSize			(worksize)
{}
2779
iterate(void)2780 tcu::TestStatus DeviceIndexTestInstance::iterate (void)
2781 {
2782 const DeviceInterface& vk = getDeviceInterface();
2783 const VkDevice device = getDevice();
2784 const VkQueue queue = getDeviceQueue(vk, device, m_queueFamilyIndex, 0);
2785 SimpleAllocator allocator (vk, device, getPhysicalDeviceMemoryProperties(m_context.getInstanceInterface(), getPhysicalDevice()));
2786 const deUint32 allocDeviceMask = (1 << m_numPhysDevices) - 1;
2787 de::Random rnd (0x82ce7f);
2788 Move<VkBuffer> sboBuffer;
2789 vk::Move<vk::VkDeviceMemory> sboBufferMemory;
2790
2791 // Create an uniform and output buffer
2792 const deUint32 uniformBufSize = 4 * (1 + VK_MAX_DEVICE_GROUP_SIZE_KHR);
2793 const VkDeviceSize uniformBufferSizeBytes = sizeof(deUint32) * uniformBufSize;
2794 const Buffer uniformBuffer(vk, device, allocator, makeBufferCreateInfo(uniformBufferSizeBytes, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT), MemoryRequirement::HostVisible);
2795
2796 const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * m_numValues;
2797 const Buffer checkBuffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_TRANSFER_DST_BIT), MemoryRequirement::HostVisible);
2798
2799 // create SBO buffer
2800 {
2801 const VkBufferCreateInfo sboBufferParams =
2802 {
2803 VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, // sType
2804 DE_NULL, // pNext
2805 0u, // flags
2806 (VkDeviceSize)bufferSizeBytes, // size
2807 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT, // usage
2808 VK_SHARING_MODE_EXCLUSIVE, // sharingMode
2809 1u, // queueFamilyIndexCount
2810 &m_queueFamilyIndex, // pQueueFamilyIndices
2811 };
2812 sboBuffer = createBuffer(vk, device, &sboBufferParams);
2813
2814 VkMemoryRequirements memReqs = getBufferMemoryRequirements(vk, device, sboBuffer.get());
2815 deUint32 memoryTypeNdx = 0;
2816 const VkPhysicalDeviceMemoryProperties deviceMemProps = getPhysicalDeviceMemoryProperties(m_context.getInstanceInterface(), getPhysicalDevice());
2817 for ( memoryTypeNdx = 0; memoryTypeNdx < deviceMemProps.memoryTypeCount; memoryTypeNdx++)
2818 {
2819 if ((memReqs.memoryTypeBits & (1u << memoryTypeNdx)) != 0 &&
2820 (deviceMemProps.memoryTypes[memoryTypeNdx].propertyFlags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) == VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT)
2821 break;
2822 }
2823 if (memoryTypeNdx == deviceMemProps.memoryTypeCount)
2824 TCU_THROW(NotSupportedError, "No compatible memory type found");
2825
2826 const VkMemoryAllocateFlagsInfo allocDeviceMaskInfo =
2827 {
2828 VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO_KHR, // sType
2829 DE_NULL, // pNext
2830 VK_MEMORY_ALLOCATE_DEVICE_MASK_BIT, // flags
2831 allocDeviceMask, // deviceMask
2832 };
2833
2834 VkMemoryAllocateInfo allocInfo =
2835 {
2836 VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, // sType
2837 &allocDeviceMaskInfo, // pNext
2838 memReqs.size, // allocationSize
2839 memoryTypeNdx, // memoryTypeIndex
2840 };
2841
2842 sboBufferMemory = allocateMemory(vk, device, &allocInfo);
2843 VK_CHECK(vk.bindBufferMemory(device, *sboBuffer, sboBufferMemory.get(), 0));
2844 }
2845
2846 // Fill the buffers with data
2847 typedef std::vector<deUint32> data_vector_t;
2848 data_vector_t uniformInputData(uniformBufSize, 0);
2849
2850 {
2851 const Allocation& bufferAllocation = uniformBuffer.getAllocation();
2852 deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
2853 for (deUint32 i = 0; i < uniformBufSize; ++i)
2854 uniformInputData[i] = *bufferPtr++ = rnd.getUint32() / 10; // divide to prevent overflow in addition
2855
2856 flushAlloc(vk, device, bufferAllocation);
2857 }
2858
2859 // Create descriptor set
2860 const Unique<VkDescriptorSetLayout> descriptorSetLayout(
2861 DescriptorSetLayoutBuilder()
2862 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
2863 .addSingleBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
2864 .build(vk, device));
2865
2866 const Unique<VkDescriptorPool> descriptorPool(
2867 DescriptorPoolBuilder()
2868 .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
2869 .addType(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)
2870 .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
2871
2872 const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
2873
2874 const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*sboBuffer, 0ull, bufferSizeBytes);
2875 const VkDescriptorBufferInfo uniformBufferDescriptorInfo = makeDescriptorBufferInfo(*uniformBuffer, 0ull, uniformBufferSizeBytes);
2876
2877 DescriptorSetUpdateBuilder()
2878 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
2879 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, &uniformBufferDescriptorInfo)
2880 .update(vk, device);
2881
2882 const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
2883 const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
2884 const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));
2885
2886 const VkBufferMemoryBarrier hostUniformWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_UNIFORM_READ_BIT, *uniformBuffer, 0ull, uniformBufferSizeBytes);
2887 const VkBufferMemoryBarrier shaderWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT , *sboBuffer, 0ull, bufferSizeBytes);
2888
2889 const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, m_queueFamilyIndex));
2890 const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
2891
2892 // Verify multiple device masks
2893 for (deUint32 physDevMask = 1; physDevMask < (1u << m_numPhysDevices); physDevMask++)
2894 {
2895 deUint32 constantValPerLoop = 0;
2896 {
2897 const Allocation& bufferAllocation = uniformBuffer.getAllocation();
2898 deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
2899 constantValPerLoop = *bufferPtr = rnd.getUint32() / 10; // divide to prevent overflow in addition
2900 flushAlloc(vk, device, bufferAllocation);
2901 }
2902 beginCommandBuffer(vk, *cmdBuffer);
2903
2904 vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
2905 vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
2906 vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostUniformWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
2907
2908 vk.cmdSetDeviceMask(*cmdBuffer, physDevMask);
2909 vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
2910
2911 vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &shaderWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
2912
2913 endCommandBuffer(vk, *cmdBuffer);
2914 submitCommandsAndWait(vk, device, queue, *cmdBuffer, true, physDevMask);
2915
2916 // Validate the results on all physical devices where compute shader was launched
2917 const VkBufferMemoryBarrier srcBufferBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT , *sboBuffer, 0ull, bufferSizeBytes);
2918 const VkBufferMemoryBarrier dstBufferBarrier = makeBufferMemoryBarrier(VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *checkBuffer, 0ull, bufferSizeBytes);
2919 const VkBufferCopy copyParams =
2920 {
2921 (VkDeviceSize)0u, // srcOffset
2922 (VkDeviceSize)0u, // dstOffset
2923 bufferSizeBytes // size
2924 };
2925
2926 for (deUint32 physDevIdx = 0; physDevIdx < m_numPhysDevices; physDevIdx++)
2927 {
2928 if (!(1<<physDevIdx & physDevMask))
2929 continue;
2930
2931 const deUint32 deviceMask = 1 << physDevIdx;
2932
2933 beginCommandBuffer(vk, *cmdBuffer);
2934 vk.cmdSetDeviceMask(*cmdBuffer, deviceMask);
2935 vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT , VK_PIPELINE_STAGE_TRANSFER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &srcBufferBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
2936 vk.cmdCopyBuffer(*cmdBuffer, *sboBuffer, *checkBuffer, 1, ©Params);
2937 vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &dstBufferBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
2938
2939 endCommandBuffer(vk, *cmdBuffer);
2940 submitCommandsAndWait(vk, device, queue, *cmdBuffer, true, deviceMask);
2941
2942 const Allocation& bufferAllocation = checkBuffer.getAllocation();
2943 invalidateAlloc(vk, device, bufferAllocation);
2944 const deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
2945
2946 for (deUint32 ndx = 0; ndx < m_numValues; ++ndx)
2947 {
2948 const deUint32 res = bufferPtr[ndx];
2949 const deUint32 ref = constantValPerLoop + uniformInputData[4 * (physDevIdx + 1)];
2950
2951 if (res != ref)
2952 {
2953 std::ostringstream msg;
2954 msg << "Comparison failed on physical device "<< getPhysicalDevice(physDevIdx) <<" ( deviceMask "<< deviceMask <<" ) for InOut.values[" << ndx << "]";
2955 return tcu::TestStatus::fail(msg.str());
2956 }
2957 }
2958 }
2959 }
2960
2961 return tcu::TestStatus::pass("Compute succeeded");
2962 }
2963
// Test case verifying that work submitted to two compute queues with different
// priorities completes correctly on both; the logic lives in
// ConcurrentComputeInstance::iterate.
class ConcurrentCompute : public vkt::TestCase
{
public:
	ConcurrentCompute (tcu::TestContext& testCtx,
					   const std::string& name,
					   const std::string& description);

	// Registers the "comp" GLSL compute shader (bitwise-inverts a 1024-uint buffer).
	void initPrograms (SourceCollections& sourceCollections) const;
	TestInstance* createInstance (Context& context) const;
};
2975
// Instance that builds a custom device exposing one high-priority (1.0) and one
// low-priority (0.0) compute queue, dispatches the same workload on both, and
// validates the results.
class ConcurrentComputeInstance : public vkt::TestInstance
{
public:
	ConcurrentComputeInstance (Context& context);

	// Runs the entire test in a single iteration.
	tcu::TestStatus iterate (void);
};
2983
ConcurrentCompute(tcu::TestContext & testCtx,const std::string & name,const std::string & description)2984 ConcurrentCompute::ConcurrentCompute (tcu::TestContext& testCtx,
2985 const std::string& name,
2986 const std::string& description)
2987 : TestCase (testCtx, name, description)
2988 {
2989 }
2990
initPrograms(SourceCollections & sourceCollections) const2991 void ConcurrentCompute::initPrograms (SourceCollections& sourceCollections) const
2992 {
2993 std::ostringstream src;
2994 src << "#version 310 es\n"
2995 << "layout (local_size_x = 1, local_size_y = 1, local_size_z = 1) in;\n"
2996 << "layout(binding = 0) buffer InOut {\n"
2997 << " uint values[1024];\n"
2998 << "} sb_inout;\n"
2999 << "void main (void) {\n"
3000 << " uvec3 size = gl_NumWorkGroups * gl_WorkGroupSize;\n"
3001 << " uint numValuesPerInv = uint(sb_inout.values.length()) / (size.x*size.y*size.z);\n"
3002 << " uint groupNdx = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
3003 << " uint offset = numValuesPerInv*groupNdx;\n"
3004 << "\n"
3005 << " for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
3006 << " sb_inout.values[offset + ndx] = ~sb_inout.values[offset + ndx];\n"
3007 << "}\n";
3008
3009 sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
3010 }
3011
createInstance(Context & context) const3012 TestInstance* ConcurrentCompute::createInstance (Context& context) const
3013 {
3014 return new ConcurrentComputeInstance(context);
3015 }
3016
ConcurrentComputeInstance(Context & context)3017 ConcurrentComputeInstance::ConcurrentComputeInstance (Context& context)
3018 : TestInstance (context)
3019 {
3020 }
3021
iterate(void)3022 tcu::TestStatus ConcurrentComputeInstance::iterate (void)
3023 {
3024 enum {
3025 NO_MATCH_FOUND = ~((deUint32)0),
3026 ERROR_NONE = 0,
3027 ERROR_WAIT = 1,
3028 ERROR_ORDER = 2
3029 };
3030
3031 struct Queues
3032 {
3033 VkQueue queue;
3034 deUint32 queueFamilyIndex;
3035 };
3036
3037 const DeviceInterface& vk = m_context.getDeviceInterface();
3038 const deUint32 numValues = 1024;
3039 const InstanceInterface& instance = m_context.getInstanceInterface();
3040 const VkPhysicalDevice physicalDevice = m_context.getPhysicalDevice();
3041 tcu::TestLog& log = m_context.getTestContext().getLog();
3042 vk::Move<vk::VkDevice> logicalDevice;
3043 std::vector<VkQueueFamilyProperties> queueFamilyProperties;
3044 VkDeviceCreateInfo deviceInfo;
3045 VkPhysicalDeviceFeatures deviceFeatures;
3046 const float queuePriorities[2] = {1.0f, 0.0f};
3047 VkDeviceQueueCreateInfo queueInfos[2];
3048 Queues queues[2] =
3049 {
3050 {DE_NULL, (deUint32)NO_MATCH_FOUND},
3051 {DE_NULL, (deUint32)NO_MATCH_FOUND}
3052 };
3053
3054 queueFamilyProperties = getPhysicalDeviceQueueFamilyProperties(instance, physicalDevice);
3055
3056 for (deUint32 queueNdx = 0; queueNdx < queueFamilyProperties.size(); ++queueNdx)
3057 {
3058 if (queueFamilyProperties[queueNdx].queueFlags & VK_QUEUE_COMPUTE_BIT)
3059 {
3060 if (NO_MATCH_FOUND == queues[0].queueFamilyIndex)
3061 queues[0].queueFamilyIndex = queueNdx;
3062
3063 if (queues[0].queueFamilyIndex != queueNdx || queueFamilyProperties[queueNdx].queueCount > 1u)
3064 {
3065 queues[1].queueFamilyIndex = queueNdx;
3066 break;
3067 }
3068 }
3069 }
3070
3071 if (queues[0].queueFamilyIndex == NO_MATCH_FOUND || queues[1].queueFamilyIndex == NO_MATCH_FOUND)
3072 TCU_THROW(NotSupportedError, "Queues couldn't be created");
3073
3074 for (int queueNdx = 0; queueNdx < 2; ++queueNdx)
3075 {
3076 VkDeviceQueueCreateInfo queueInfo;
3077 deMemset(&queueInfo, 0, sizeof(queueInfo));
3078
3079 queueInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
3080 queueInfo.pNext = DE_NULL;
3081 queueInfo.flags = (VkDeviceQueueCreateFlags)0u;
3082 queueInfo.queueFamilyIndex = queues[queueNdx].queueFamilyIndex;
3083 queueInfo.queueCount = (queues[0].queueFamilyIndex == queues[1].queueFamilyIndex) ? 2 : 1;
3084 queueInfo.pQueuePriorities = (queueInfo.queueCount == 2) ? queuePriorities : &queuePriorities[queueNdx];
3085
3086 queueInfos[queueNdx] = queueInfo;
3087
3088 if (queues[0].queueFamilyIndex == queues[1].queueFamilyIndex)
3089 break;
3090 }
3091 deMemset(&deviceInfo, 0, sizeof(deviceInfo));
3092 instance.getPhysicalDeviceFeatures(physicalDevice, &deviceFeatures);
3093
3094 deviceInfo.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
3095 deviceInfo.pNext = DE_NULL;
3096 deviceInfo.enabledExtensionCount = 0u;
3097 deviceInfo.ppEnabledExtensionNames = DE_NULL;
3098 deviceInfo.enabledLayerCount = 0u;
3099 deviceInfo.ppEnabledLayerNames = DE_NULL;
3100 deviceInfo.pEnabledFeatures = &deviceFeatures;
3101 deviceInfo.queueCreateInfoCount = (queues[0].queueFamilyIndex == queues[1].queueFamilyIndex) ? 1 : 2;
3102 deviceInfo.pQueueCreateInfos = queueInfos;
3103
3104 logicalDevice = createCustomDevice(m_context.getTestContext().getCommandLine().isValidationEnabled(), m_context.getPlatformInterface(), m_context.getInstance(), instance, physicalDevice, &deviceInfo);
3105
3106 for (deUint32 queueReqNdx = 0; queueReqNdx < 2; ++queueReqNdx)
3107 {
3108 if (queues[0].queueFamilyIndex == queues[1].queueFamilyIndex)
3109 vk.getDeviceQueue(*logicalDevice, queues[queueReqNdx].queueFamilyIndex, queueReqNdx, &queues[queueReqNdx].queue);
3110 else
3111 vk.getDeviceQueue(*logicalDevice, queues[queueReqNdx].queueFamilyIndex, 0u, &queues[queueReqNdx].queue);
3112 }
3113
3114 // Create an input/output buffers
3115 const VkPhysicalDeviceMemoryProperties memoryProperties = vk::getPhysicalDeviceMemoryProperties(instance, physicalDevice);
3116
3117 SimpleAllocator *allocator = new SimpleAllocator(vk, *logicalDevice, memoryProperties);
3118 const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * numValues;
3119 const Buffer buffer1(vk, *logicalDevice, *allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
3120 const Buffer buffer2(vk, *logicalDevice, *allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
3121
3122 // Fill the buffers with data
3123
3124 typedef std::vector<deUint32> data_vector_t;
3125 data_vector_t inputData(numValues);
3126
3127 {
3128 de::Random rnd(0x82ce7f);
3129 const Allocation& bufferAllocation1 = buffer1.getAllocation();
3130 const Allocation& bufferAllocation2 = buffer2.getAllocation();
3131 deUint32* bufferPtr1 = static_cast<deUint32*>(bufferAllocation1.getHostPtr());
3132 deUint32* bufferPtr2 = static_cast<deUint32*>(bufferAllocation2.getHostPtr());
3133
3134 for (deUint32 i = 0; i < numValues; ++i)
3135 {
3136 deUint32 val = rnd.getUint32();
3137 inputData[i] = val;
3138 *bufferPtr1++ = val;
3139 *bufferPtr2++ = val;
3140 }
3141
3142 flushAlloc(vk, *logicalDevice, bufferAllocation1);
3143 flushAlloc(vk, *logicalDevice, bufferAllocation2);
3144 }
3145
3146 // Create descriptor sets
3147
3148 const Unique<VkDescriptorSetLayout> descriptorSetLayout1(
3149 DescriptorSetLayoutBuilder()
3150 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
3151 .build(vk, *logicalDevice));
3152
3153 const Unique<VkDescriptorPool> descriptorPool1(
3154 DescriptorPoolBuilder()
3155 .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
3156 .build(vk, *logicalDevice, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
3157
3158 const Unique<VkDescriptorSet> descriptorSet1(makeDescriptorSet(vk, *logicalDevice, *descriptorPool1, *descriptorSetLayout1));
3159
3160 const VkDescriptorBufferInfo bufferDescriptorInfo1 = makeDescriptorBufferInfo(*buffer1, 0ull, bufferSizeBytes);
3161 DescriptorSetUpdateBuilder()
3162 .writeSingle(*descriptorSet1, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo1)
3163 .update(vk, *logicalDevice);
3164
3165 const Unique<VkDescriptorSetLayout> descriptorSetLayout2(
3166 DescriptorSetLayoutBuilder()
3167 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
3168 .build(vk, *logicalDevice));
3169
3170 const Unique<VkDescriptorPool> descriptorPool2(
3171 DescriptorPoolBuilder()
3172 .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
3173 .build(vk, *logicalDevice, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
3174
3175 const Unique<VkDescriptorSet> descriptorSet2(makeDescriptorSet(vk, *logicalDevice, *descriptorPool2, *descriptorSetLayout2));
3176
3177 const VkDescriptorBufferInfo bufferDescriptorInfo2 = makeDescriptorBufferInfo(*buffer2, 0ull, bufferSizeBytes);
3178 DescriptorSetUpdateBuilder()
3179 .writeSingle(*descriptorSet2, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo2)
3180 .update(vk, *logicalDevice);
3181
3182 // Perform the computation
3183
3184 const Unique<VkShaderModule> shaderModule(createShaderModule(vk, *logicalDevice, m_context.getBinaryCollection().get("comp"), 0u));
3185
3186 const Unique<VkPipelineLayout> pipelineLayout1(makePipelineLayout(vk, *logicalDevice, *descriptorSetLayout1));
3187 const Unique<VkPipeline> pipeline1(makeComputePipeline(vk, *logicalDevice, *pipelineLayout1, *shaderModule));
3188 const VkBufferMemoryBarrier hostWriteBarrier1 = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *buffer1, 0ull, bufferSizeBytes);
3189 const VkBufferMemoryBarrier shaderWriteBarrier1 = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer1, 0ull, bufferSizeBytes);
3190 const Unique<VkCommandPool> cmdPool1(makeCommandPool(vk, *logicalDevice, queues[0].queueFamilyIndex));
3191 const Unique<VkCommandBuffer> cmdBuffer1(allocateCommandBuffer(vk, *logicalDevice, *cmdPool1, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
3192
3193 const Unique<VkPipelineLayout> pipelineLayout2(makePipelineLayout(vk, *logicalDevice, *descriptorSetLayout2));
3194 const Unique<VkPipeline> pipeline2(makeComputePipeline(vk, *logicalDevice, *pipelineLayout2, *shaderModule));
3195 const VkBufferMemoryBarrier hostWriteBarrier2 = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *buffer2, 0ull, bufferSizeBytes);
3196 const VkBufferMemoryBarrier shaderWriteBarrier2 = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer2, 0ull, bufferSizeBytes);
3197 const Unique<VkCommandPool> cmdPool2(makeCommandPool(vk, *logicalDevice, queues[1].queueFamilyIndex));
3198 const Unique<VkCommandBuffer> cmdBuffer2(allocateCommandBuffer(vk, *logicalDevice, *cmdPool2, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
3199
3200 // Command buffer 1
3201
3202 beginCommandBuffer(vk, *cmdBuffer1);
3203 vk.cmdBindPipeline(*cmdBuffer1, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline1);
3204 vk.cmdBindDescriptorSets(*cmdBuffer1, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout1, 0u, 1u, &descriptorSet1.get(), 0u, DE_NULL);
3205 vk.cmdPipelineBarrier(*cmdBuffer1, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostWriteBarrier1, 0, (const VkImageMemoryBarrier*)DE_NULL);
3206 vk.cmdDispatch(*cmdBuffer1, 1, 1, 1);
3207 vk.cmdPipelineBarrier(*cmdBuffer1, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &shaderWriteBarrier1, 0, (const VkImageMemoryBarrier*)DE_NULL);
3208 endCommandBuffer(vk, *cmdBuffer1);
3209
3210 // Command buffer 2
3211
3212 beginCommandBuffer(vk, *cmdBuffer2);
3213 vk.cmdBindPipeline(*cmdBuffer2, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline2);
3214 vk.cmdBindDescriptorSets(*cmdBuffer2, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout2, 0u, 1u, &descriptorSet2.get(), 0u, DE_NULL);
3215 vk.cmdPipelineBarrier(*cmdBuffer2, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostWriteBarrier2, 0, (const VkImageMemoryBarrier*)DE_NULL);
3216 vk.cmdDispatch(*cmdBuffer2, 1, 1, 1);
3217 vk.cmdPipelineBarrier(*cmdBuffer2, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &shaderWriteBarrier2, 0, (const VkImageMemoryBarrier*)DE_NULL);
3218 endCommandBuffer(vk, *cmdBuffer2);
3219
3220 VkSubmitInfo submitInfo1 =
3221 {
3222 VK_STRUCTURE_TYPE_SUBMIT_INFO, // sType
3223 DE_NULL, // pNext
3224 0u, // waitSemaphoreCount
3225 DE_NULL, // pWaitSemaphores
3226 (const VkPipelineStageFlags*)DE_NULL, // pWaitDstStageMask
3227 1u, // commandBufferCount
3228 &cmdBuffer1.get(), // pCommandBuffers
3229 0u, // signalSemaphoreCount
3230 DE_NULL // pSignalSemaphores
3231 };
3232
3233 VkSubmitInfo submitInfo2 =
3234 {
3235 VK_STRUCTURE_TYPE_SUBMIT_INFO, // sType
3236 DE_NULL, // pNext
3237 0u, // waitSemaphoreCount
3238 DE_NULL, // pWaitSemaphores
3239 (const VkPipelineStageFlags*)DE_NULL, // pWaitDstStageMask
3240 1u, // commandBufferCount
3241 &cmdBuffer2.get(), // pCommandBuffers
3242 0u, // signalSemaphoreCount
3243 DE_NULL // pSignalSemaphores
3244 };
3245
3246 // Wait for completion
3247 const Unique<VkFence> fence1(createFence(vk, *logicalDevice));
3248 const Unique<VkFence> fence2(createFence(vk, *logicalDevice));
3249
3250 VK_CHECK(vk.queueSubmit(queues[0].queue, 1u, &submitInfo1, *fence1));
3251 VK_CHECK(vk.queueSubmit(queues[1].queue, 1u, &submitInfo2, *fence2));
3252
3253 int err = ERROR_NONE;
3254
3255 // First wait for the low-priority queue
3256 if (VK_SUCCESS != vk.waitForFences(*logicalDevice, 1u, &fence2.get(), DE_TRUE, ~0ull))
3257 err = ERROR_WAIT;
3258
3259 // If the high-priority queue hasn't finished, we have a problem.
3260 if (VK_SUCCESS != vk.getFenceStatus(*logicalDevice, fence1.get()))
3261 if (err == ERROR_NONE)
3262 err = ERROR_ORDER;
3263
3264 // Wait for the high-priority fence so we don't get errors on teardown.
3265 vk.waitForFences(*logicalDevice, 1u, &fence1.get(), DE_TRUE, ~0ull);
3266
3267 // If we fail() before waiting for all of the fences, error will come from
3268 // teardown instead of the error we want.
3269
3270 if (err == ERROR_WAIT)
3271 return tcu::TestStatus::fail("Failed waiting for low-priority queue fence.");
3272
3273 // Validate the results
3274
3275 const Allocation& bufferAllocation1 = buffer1.getAllocation();
3276 invalidateAlloc(vk, *logicalDevice, bufferAllocation1);
3277 const deUint32* bufferPtr1 = static_cast<deUint32*>(bufferAllocation1.getHostPtr());
3278
3279 const Allocation& bufferAllocation2 = buffer2.getAllocation();
3280 invalidateAlloc(vk, *logicalDevice, bufferAllocation2);
3281 const deUint32* bufferPtr2 = static_cast<deUint32*>(bufferAllocation2.getHostPtr());
3282
3283 for (deUint32 ndx = 0; ndx < numValues; ++ndx)
3284 {
3285 const deUint32 res1 = bufferPtr1[ndx];
3286 const deUint32 res2 = bufferPtr2[ndx];
3287 const deUint32 inp = inputData[ndx];
3288 const deUint32 ref = ~inp;
3289
3290 if (res1 != ref || res1 != res2)
3291 {
3292 std::ostringstream msg;
3293 msg << "Comparison failed for InOut.values[" << ndx << "] ref:" << ref <<" res1:" << res1 << " res2:" << res2 << " inp:" << inp;
3294 return tcu::TestStatus::fail(msg.str());
3295 }
3296 }
3297
3298 if (err == ERROR_ORDER)
3299 log << tcu::TestLog::Message << "Note: Low-priority queue was faster than high-priority one. This is not an error, but priorities may be inverted." << tcu::TestLog::EndMessage;
3300
3301 return tcu::TestStatus::pass("Test passed");
3302 }
3303
// Checks that a vkCmdDispatch with at least one zero-sized axis launches no
// invocations: after an empty dispatch plus a 1x1x1 dispatch, the shader's
// atomic counter must read exactly 1.
class EmptyWorkGroupCase : public vkt::TestCase
{
public:
					EmptyWorkGroupCase	(tcu::TestContext& testCtx, const std::string& name, const std::string& description, const tcu::UVec3& dispatchSize);
	virtual			~EmptyWorkGroupCase	(void) {}

	TestInstance*	createInstance		(Context& context) const override;
	void			initPrograms		(vk::SourceCollections& programCollection) const override;

protected:
	// Dispatch dimensions; at least one component must be zero (asserted in the ctor).
	const tcu::UVec3 m_dispatchSize;
};
3316
// Instance for EmptyWorkGroupCase; records the (partially empty) dispatch
// followed by a single-workgroup dispatch and verifies the counter.
class EmptyWorkGroupInstance : public vkt::TestInstance
{
public:
						EmptyWorkGroupInstance	(Context& context, const tcu::UVec3& dispatchSize)
							: vkt::TestInstance	(context)
							, m_dispatchSize	(dispatchSize)
							{}
	virtual				~EmptyWorkGroupInstance	(void) {}

	tcu::TestStatus		iterate					(void) override;

protected:
	// Dimensions of the first (expected-empty) dispatch.
	const tcu::UVec3	m_dispatchSize;
};
3331
EmptyWorkGroupCase(tcu::TestContext & testCtx,const std::string & name,const std::string & description,const tcu::UVec3 & dispatchSize)3332 EmptyWorkGroupCase::EmptyWorkGroupCase (tcu::TestContext& testCtx, const std::string& name, const std::string& description, const tcu::UVec3& dispatchSize)
3333 : vkt::TestCase (testCtx, name, description)
3334 , m_dispatchSize (dispatchSize)
3335 {
3336 DE_ASSERT(m_dispatchSize.x() == 0u || m_dispatchSize.y() == 0u || m_dispatchSize.z() == 0u);
3337 }
3338
createInstance(Context & context) const3339 TestInstance* EmptyWorkGroupCase::createInstance (Context& context) const
3340 {
3341 return new EmptyWorkGroupInstance(context, m_dispatchSize);
3342 }
3343
initPrograms(vk::SourceCollections & programCollection) const3344 void EmptyWorkGroupCase::initPrograms (vk::SourceCollections& programCollection) const
3345 {
3346 std::ostringstream comp;
3347 comp
3348 << "#version 450\n"
3349 << "layout (local_size_x=1, local_size_y=1, local_size_z=1) in;\n"
3350 << "layout (set=0, binding=0) buffer VerificationBlock { uint value; } verif;\n"
3351 << "void main () { atomicAdd(verif.value, 1u); }\n"
3352 ;
3353 programCollection.glslSources.add("comp") << glu::ComputeSource(comp.str());
3354 }
3355
iterate(void)3356 tcu::TestStatus EmptyWorkGroupInstance::iterate (void)
3357 {
3358 const auto& vkd = m_context.getDeviceInterface();
3359 const auto device = m_context.getDevice();
3360 auto& alloc = m_context.getDefaultAllocator();
3361 const auto queueIndex = m_context.getUniversalQueueFamilyIndex();
3362 const auto queue = m_context.getUniversalQueue();
3363
3364 const auto verifBufferSize = static_cast<VkDeviceSize>(sizeof(uint32_t));
3365 const auto verifBufferInfo = makeBufferCreateInfo(verifBufferSize, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT);
3366 BufferWithMemory verifBuffer (vkd, device, alloc, verifBufferInfo, MemoryRequirement::HostVisible);
3367 auto& verifBufferAlloc = verifBuffer.getAllocation();
3368 void* verifBufferPtr = verifBufferAlloc.getHostPtr();
3369
3370 deMemset(verifBufferPtr, 0, static_cast<size_t>(verifBufferSize));
3371 flushAlloc(vkd, device, verifBufferAlloc);
3372
3373 DescriptorSetLayoutBuilder layoutBuilder;
3374 layoutBuilder.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT);
3375 const auto descriptorSetLayout = layoutBuilder.build(vkd, device);
3376
3377 const auto pipelineLayout = makePipelineLayout(vkd, device, descriptorSetLayout.get());
3378 const auto shaderModule = createShaderModule(vkd, device, m_context.getBinaryCollection().get("comp"), 0u);
3379 const auto pipeline = makeComputePipeline(vkd, device, pipelineLayout.get(), shaderModule.get());
3380
3381 DescriptorPoolBuilder poolBuilder;
3382 poolBuilder.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
3383 const auto descriptorPool = poolBuilder.build(vkd, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u);
3384 const auto descriptorSet = makeDescriptorSet(vkd, device, descriptorPool.get(), descriptorSetLayout.get());
3385
3386 DescriptorSetUpdateBuilder updateBuilder;
3387 const auto verifBufferDescInfo = makeDescriptorBufferInfo(verifBuffer.get(), 0ull, verifBufferSize);
3388 updateBuilder.writeSingle(descriptorSet.get(), DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &verifBufferDescInfo);
3389 updateBuilder.update(vkd, device);
3390
3391 const auto cmdPool = makeCommandPool(vkd, device, queueIndex);
3392 const auto cmdBufferPtr = allocateCommandBuffer(vkd, device, cmdPool.get(), VK_COMMAND_BUFFER_LEVEL_PRIMARY);
3393 const auto cmdBuffer = cmdBufferPtr.get();
3394
3395 beginCommandBuffer(vkd, cmdBuffer);
3396 vkd.cmdBindPipeline(cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline.get());
3397 vkd.cmdBindDescriptorSets(cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipelineLayout.get(), 0u, 1u, &descriptorSet.get(), 0u, nullptr);
3398 vkd.cmdDispatch(cmdBuffer, m_dispatchSize.x(), m_dispatchSize.y(), m_dispatchSize.z());
3399
3400 const auto readWriteAccess = (VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_SHADER_READ_BIT);
3401 const auto computeToCompute = makeMemoryBarrier(readWriteAccess, readWriteAccess);
3402 vkd.cmdPipelineBarrier(cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0U, 1u, &computeToCompute, 0u, nullptr, 0u, nullptr);
3403
3404 vkd.cmdDispatch(cmdBuffer, 1u, 1u, 1u);
3405
3406 const auto computeToHost = makeMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT);
3407 vkd.cmdPipelineBarrier(cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, 0u, 1u, &computeToHost, 0u, nullptr, 0u, nullptr);
3408
3409 endCommandBuffer(vkd, cmdBuffer);
3410 submitCommandsAndWait(vkd, device, queue, cmdBuffer);
3411
3412 uint32_t value;
3413 invalidateAlloc(vkd, device, verifBufferAlloc);
3414 deMemcpy(&value, verifBufferPtr, sizeof(value));
3415
3416 if (value != 1u)
3417 {
3418 std::ostringstream msg;
3419 msg << "Unexpected value found in buffer: " << value << " while expecting 1";
3420 TCU_FAIL(msg.str());
3421 }
3422
3423 return tcu::TestStatus::pass("Pass");
3424 }
3425
// Tests using the maximum reported workgroup size along one axis (with the
// other two axes set to 1); each invocation marks its slot in an SSBO.
class MaxWorkGroupSizeTest : public vkt::TestCase
{
public:
	enum class Axis { X = 0, Y = 1, Z = 2 };

	struct Params
	{
		// Which axis to maximize.
		Axis axis;
	};

							MaxWorkGroupSizeTest	(tcu::TestContext& testCtx, const std::string& name, const std::string& description, const Params& params);
	virtual					~MaxWorkGroupSizeTest	(void) {}

	virtual void			initPrograms			(vk::SourceCollections& programCollection) const;
	virtual TestInstance*	createInstance			(Context& context) const;
	virtual void			checkSupport			(Context& context) const;

	// Helper to transform the axis value to an index.
	static int				getIndex				(Axis axis);

	// Helper returning the number of invocations according to the test parameters.
	// Reads maxComputeWorkGroupSize from devProperties when given, otherwise
	// queries the physical device.
	static deUint32			getInvocations			(const Params& params, const vk::InstanceInterface& vki, vk::VkPhysicalDevice physicalDevice, const vk::VkPhysicalDeviceProperties* devProperties = nullptr);

	// Helper returning the buffer size needed by this test (one uint32 per invocation).
	static deUint32			getSSBOSize				(deUint32 invocations);

private:
	Params m_params;
};
3456
// Instance for MaxWorkGroupSizeTest; runs the dispatch and verifies the SSBO.
class MaxWorkGroupSizeInstance : public vkt::TestInstance
{
public:
								MaxWorkGroupSizeInstance	(Context& context, const MaxWorkGroupSizeTest::Params& params);
	virtual						~MaxWorkGroupSizeInstance	(void) {}

	virtual tcu::TestStatus		iterate						(void);

private:
	// Copy of the test parameters (axis selection).
	MaxWorkGroupSizeTest::Params m_params;
};
3468
getIndex(Axis axis)3469 int MaxWorkGroupSizeTest::getIndex (Axis axis)
3470 {
3471 const int ret = static_cast<int>(axis);
3472 DE_ASSERT(ret >= static_cast<int>(Axis::X) && ret <= static_cast<int>(Axis::Z));
3473 return ret;
3474 }
3475
getInvocations(const Params & params,const vk::InstanceInterface & vki,vk::VkPhysicalDevice physicalDevice,const vk::VkPhysicalDeviceProperties * devProperties)3476 deUint32 MaxWorkGroupSizeTest::getInvocations (const Params& params, const vk::InstanceInterface& vki, vk::VkPhysicalDevice physicalDevice, const vk::VkPhysicalDeviceProperties* devProperties)
3477 {
3478 const auto axis = getIndex(params.axis);
3479
3480 if (devProperties)
3481 return devProperties->limits.maxComputeWorkGroupSize[axis];
3482 return vk::getPhysicalDeviceProperties(vki, physicalDevice).limits.maxComputeWorkGroupSize[axis];
3483 }
3484
getSSBOSize(deUint32 invocations)3485 deUint32 MaxWorkGroupSizeTest::getSSBOSize (deUint32 invocations)
3486 {
3487 return invocations * static_cast<deUint32>(sizeof(deUint32));
3488 }
3489
MaxWorkGroupSizeTest(tcu::TestContext & testCtx,const std::string & name,const std::string & description,const Params & params)3490 MaxWorkGroupSizeTest::MaxWorkGroupSizeTest (tcu::TestContext& testCtx, const std::string& name, const std::string& description, const Params& params)
3491 : vkt::TestCase (testCtx, name, description)
3492 , m_params (params)
3493 {}
3494
initPrograms(vk::SourceCollections & programCollection) const3495 void MaxWorkGroupSizeTest::initPrograms (vk::SourceCollections& programCollection) const
3496 {
3497 std::ostringstream shader;
3498
3499 // The actual local sizes will be set using spec constants when running the test instance.
3500 shader
3501 << "#version 450\n"
3502 << "\n"
3503 << "layout(constant_id=0) const int local_size_x_val = 1;\n"
3504 << "layout(constant_id=1) const int local_size_y_val = 1;\n"
3505 << "layout(constant_id=2) const int local_size_z_val = 1;\n"
3506 << "\n"
3507 << "layout(local_size_x_id=0, local_size_y_id=1, local_size_z_id=2) in;\n"
3508 << "\n"
3509 << "layout(set=0, binding=0) buffer StorageBuffer {\n"
3510 << " uint values[];\n"
3511 << "} ssbo;\n"
3512 << "\n"
3513 << "void main() {\n"
3514 << " ssbo.values[gl_LocalInvocationIndex] = 1u;\n"
3515 << "}\n"
3516 ;
3517
3518 programCollection.glslSources.add("comp") << glu::ComputeSource(shader.str());
3519 }
3520
// Creates the per-run instance, forwarding the configured axis parameters.
TestInstance* MaxWorkGroupSizeTest::createInstance (Context& context) const
{
	return new MaxWorkGroupSizeInstance(context, m_params);
}
3525
checkSupport(Context & context) const3526 void MaxWorkGroupSizeTest::checkSupport (Context& context) const
3527 {
3528 const auto& vki = context.getInstanceInterface();
3529 const auto physicalDevice = context.getPhysicalDevice();
3530
3531 const auto properties = vk::getPhysicalDeviceProperties(vki, physicalDevice);
3532 const auto invocations = getInvocations(m_params, vki, physicalDevice, &properties);
3533
3534 if (invocations > properties.limits.maxComputeWorkGroupInvocations)
3535 TCU_FAIL("Reported workgroup size limit in the axis is greater than the global invocation limit");
3536
3537 if (properties.limits.maxStorageBufferRange / static_cast<deUint32>(sizeof(deUint32)) < invocations)
3538 TCU_THROW(NotSupportedError, "Maximum supported storage buffer range too small");
3539 }
3540
// Constructs the instance, keeping a copy of the axis parameters for iterate().
MaxWorkGroupSizeInstance::MaxWorkGroupSizeInstance (Context& context, const MaxWorkGroupSizeTest::Params& params)
	: vkt::TestInstance	(context)
	, m_params			(params)
{}
3545
// Runs a single compute dispatch of one workgroup sized to the device's maximum
// on the selected axis (via specialization constants), then checks on the host
// that every invocation wrote its flag into the SSBO.
tcu::TestStatus MaxWorkGroupSizeInstance::iterate (void)
{
	const auto&	vki				= m_context.getInstanceInterface();
	const auto&	vkd				= m_context.getDeviceInterface();
	const auto	physicalDevice	= m_context.getPhysicalDevice();
	const auto	device			= m_context.getDevice();
	auto&		alloc			= m_context.getDefaultAllocator();
	const auto	queueIndex		= m_context.getUniversalQueueFamilyIndex();
	const auto	queue			= m_context.getUniversalQueue();
	auto&		log				= m_context.getTestContext().getLog();

	// Re-derive the invocation count from the device limits (checkSupport validated these).
	const auto	axis			= MaxWorkGroupSizeTest::getIndex(m_params.axis);
	const auto	invocations		= MaxWorkGroupSizeTest::getInvocations(m_params, vki, physicalDevice);
	const auto	ssboSize		= static_cast<vk::VkDeviceSize>(MaxWorkGroupSizeTest::getSSBOSize(invocations));

	log
		<< tcu::TestLog::Message
		<< "Running test with " << invocations << " invocations on axis " << axis << " using a storage buffer size of " << ssboSize << " bytes"
		<< tcu::TestLog::EndMessage
		;

	// Main SSBO buffer. Host-visible so results can be read back directly.
	const auto				ssboInfo	= vk::makeBufferCreateInfo(ssboSize, vk::VK_BUFFER_USAGE_STORAGE_BUFFER_BIT);
	vk::BufferWithMemory	ssbo		(vkd, device, alloc, ssboInfo, vk::MemoryRequirement::HostVisible);

	// Shader module.
	const auto shaderModule = vk::createShaderModule(vkd, device, m_context.getBinaryCollection().get("comp"), 0u);

	// Descriptor set layouts.
	vk::DescriptorSetLayoutBuilder layoutBuilder;
	layoutBuilder.addSingleBinding(vk::VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, vk::VK_SHADER_STAGE_COMPUTE_BIT);
	const auto descriptorSetLayout = layoutBuilder.build(vkd, device);

	// Specialization constants: set the number of invocations in the appropriate local size id.
	// The other two axes keep a local size of 1, so the total equals 'invocations'.
	const auto	entrySize				= static_cast<deUintptr>(sizeof(deInt32));
	deInt32		specializationData[3]	= { 1, 1, 1 };
	specializationData[axis] = static_cast<deInt32>(invocations);

	const vk::VkSpecializationMapEntry specializationMaps[3] =
	{
		{
			0u,										// deUint32	constantID;
			0u,										// deUint32	offset;
			entrySize,								// deUintptr	size;
		},
		{
			1u,										// deUint32	constantID;
			static_cast<deUint32>(entrySize),		// deUint32	offset;
			entrySize,								// deUintptr	size;
		},
		{
			2u,										// deUint32	constantID;
			static_cast<deUint32>(entrySize * 2u),	// deUint32	offset;
			entrySize,								// deUintptr	size;
		},
	};

	const vk::VkSpecializationInfo specializationInfo =
	{
		3u,														// deUint32							mapEntryCount;
		specializationMaps,										// const VkSpecializationMapEntry*	pMapEntries;
		static_cast<deUintptr>(sizeof(specializationData)),		// deUintptr						dataSize;
		specializationData,										// const void*						pData;
	};

	// Test pipeline.
	const vk::VkPipelineLayoutCreateInfo testPipelineLayoutInfo =
	{
		vk::VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,	// VkStructureType					sType;
		nullptr,											// const void*						pNext;
		0u,													// VkPipelineLayoutCreateFlags		flags;
		1u,													// deUint32							setLayoutCount;
		&descriptorSetLayout.get(),							// const VkDescriptorSetLayout*		pSetLayouts;
		0u,													// deUint32							pushConstantRangeCount;
		nullptr,											// const VkPushConstantRange*		pPushConstantRanges;
	};
	const auto testPipelineLayout = vk::createPipelineLayout(vkd, device, &testPipelineLayoutInfo);

	const vk::VkComputePipelineCreateInfo testPipelineInfo =
	{
		vk::VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,			// VkStructureType					sType;
		nullptr,													// const void*						pNext;
		0u,															// VkPipelineCreateFlags			flags;
		{															// VkPipelineShaderStageCreateInfo	stage;
			vk::VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,	// VkStructureType					sType;
			nullptr,													// const void*						pNext;
			0u,															// VkPipelineShaderStageCreateFlags	flags;
			vk::VK_SHADER_STAGE_COMPUTE_BIT,							// VkShaderStageFlagBits			stage;
			shaderModule.get(),											// VkShaderModule					module;
			"main",														// const char*						pName;
			&specializationInfo,										// const VkSpecializationInfo*		pSpecializationInfo;
		},
		testPipelineLayout.get(),									// VkPipelineLayout					layout;
		DE_NULL,													// VkPipeline						basePipelineHandle;
		0u,															// deInt32							basePipelineIndex;
	};
	const auto testPipeline = vk::createComputePipeline(vkd, device, DE_NULL, &testPipelineInfo);

	// Create descriptor pool and set.
	vk::DescriptorPoolBuilder poolBuilder;
	poolBuilder.addType(vk::VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
	const auto descriptorPool	= poolBuilder.build(vkd, device, vk::VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u);
	const auto descriptorSet	= vk::makeDescriptorSet(vkd, device, descriptorPool.get(), descriptorSetLayout.get());

	// Update descriptor set.
	const vk::VkDescriptorBufferInfo ssboBufferInfo =
	{
		ssbo.get(),		// VkBuffer		buffer;
		0u,				// VkDeviceSize	offset;
		VK_WHOLE_SIZE,	// VkDeviceSize	range;
	};

	vk::DescriptorSetUpdateBuilder updateBuilder;
	updateBuilder.writeSingle(descriptorSet.get(), vk::DescriptorSetUpdateBuilder::Location::binding(0u), vk::VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &ssboBufferInfo);
	updateBuilder.update(vkd, device);

	// Clear buffer. Zeroing from the host lets the later verification detect
	// any invocation that failed to write its 1u flag.
	auto&	ssboAlloc	= ssbo.getAllocation();
	void*	ssboPtr		= ssboAlloc.getHostPtr();
	deMemset(ssboPtr, 0, static_cast<size_t>(ssboSize));
	vk::flushAlloc(vkd, device, ssboAlloc);

	// Run pipelines.
	const auto cmdPool		= vk::makeCommandPool(vkd, device, queueIndex);
	const auto cmdBUfferPtr	= vk::allocateCommandBuffer(vkd, device, cmdPool.get(), vk::VK_COMMAND_BUFFER_LEVEL_PRIMARY);
	const auto cmdBuffer	= cmdBUfferPtr.get();

	vk::beginCommandBuffer(vkd, cmdBuffer);

	// Run the main test shader.
	// Make the host-side zero fill visible before the shader writes to the buffer.
	const auto hostToComputeBarrier = vk::makeBufferMemoryBarrier(vk::VK_ACCESS_HOST_WRITE_BIT, vk::VK_ACCESS_SHADER_WRITE_BIT, ssbo.get(), 0ull, VK_WHOLE_SIZE);
	vkd.cmdPipelineBarrier(cmdBuffer, vk::VK_PIPELINE_STAGE_HOST_BIT, vk::VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0u, 0u, nullptr, 1u, &hostToComputeBarrier, 0u, nullptr);

	vkd.cmdBindPipeline(cmdBuffer, vk::VK_PIPELINE_BIND_POINT_COMPUTE, testPipeline.get());
	vkd.cmdBindDescriptorSets(cmdBuffer, vk::VK_PIPELINE_BIND_POINT_COMPUTE, testPipelineLayout.get(), 0u, 1u, &descriptorSet.get(), 0u, nullptr);
	// A single workgroup; its local size carries the full invocation count.
	vkd.cmdDispatch(cmdBuffer, 1u, 1u, 1u);

	// Make the shader writes visible to the host read-back below.
	const auto computeToHostBarrier = vk::makeBufferMemoryBarrier(vk::VK_ACCESS_SHADER_WRITE_BIT, vk::VK_ACCESS_HOST_READ_BIT, ssbo.get(), 0ull, VK_WHOLE_SIZE);
	vkd.cmdPipelineBarrier(cmdBuffer, vk::VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, vk::VK_PIPELINE_STAGE_HOST_BIT, 0u, 0u, nullptr, 1u, &computeToHostBarrier, 0u, nullptr);

	vk::endCommandBuffer(vkd, cmdBuffer);
	vk::submitCommandsAndWait(vkd, device, queue, cmdBuffer);

	// Verify buffer contents.
	vk::invalidateAlloc(vkd, device, ssboAlloc);
	std::unique_ptr<deUint32[]>	valuesArray	(new deUint32[invocations]);
	deUint32*					valuesPtr	= valuesArray.get();
	deMemcpy(valuesPtr, ssboPtr, static_cast<size_t>(ssboSize));

	std::string	errorMsg;
	bool		ok			= true;

	// Every invocation must have flagged its own slot with 1u.
	for (size_t i = 0; i < invocations; ++i)
	{
		if (valuesPtr[i] != 1u)
		{
			ok = false;
			errorMsg = "Found invalid value for invocation index " + de::toString(i) + ": expected 1u and found " + de::toString(valuesPtr[i]);
			break;
		}
	}

	if (!ok)
		return tcu::TestStatus::fail(errorMsg);
	return tcu::TestStatus::pass("Pass");
}
3712
3713 namespace EmptyShaderTest
3714 {
3715
createProgram(SourceCollections & dst)3716 void createProgram (SourceCollections& dst)
3717 {
3718 dst.glslSources.add("comp") << glu::ComputeSource(
3719 "#version 310 es\n"
3720 "layout (local_size_x = 1) in;\n"
3721 "void main (void) {}\n"
3722 );
3723 }
3724
createTest(Context & context)3725 tcu::TestStatus createTest (Context& context)
3726 {
3727 const DeviceInterface& vk = context.getDeviceInterface();
3728 const VkDevice device = context.getDevice();
3729 const VkQueue queue = context.getUniversalQueue();
3730 const deUint32 queueFamilyIndex = context.getUniversalQueueFamilyIndex();
3731
3732 const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, context.getBinaryCollection().get("comp"), 0u));
3733
3734 const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device));
3735 const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));
3736
3737 const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
3738 const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
3739
3740 // Start recording commands
3741
3742 beginCommandBuffer(vk, *cmdBuffer);
3743
3744 vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
3745
3746 const tcu::IVec3 workGroups(1, 1, 1);
3747 vk.cmdDispatch(*cmdBuffer, workGroups.x(), workGroups.y(), workGroups.z());
3748
3749 endCommandBuffer(vk, *cmdBuffer);
3750
3751 submitCommandsAndWait(vk, device, queue, *cmdBuffer);
3752
3753 return tcu::TestStatus::pass("Compute succeeded");
3754 }
3755
3756 } // EmptyShaderTest ns
3757 } // anonymous
3758
// Builds the "basic" compute test group, registering one child case per
// shader/dispatch configuration defined earlier in this file.
tcu::TestCaseGroup* createBasicComputeShaderTests (tcu::TestContext& testCtx)
{
	de::MovePtr<tcu::TestCaseGroup> basicComputeTests(new tcu::TestCaseGroup(testCtx, "basic", "Basic compute tests"));

	// Trivial dispatch of a shader that does nothing.
	addFunctionCaseWithPrograms(basicComputeTests.get(), "empty_shader", "Shader that does nothing", EmptyShaderTest::createProgram, EmptyShaderTest::createTest);

	basicComputeTests->addChild(new ConcurrentCompute(testCtx, "concurrent_compute", "Concurrent compute test"));

	// Workgroups with a zero-sized axis (dispatching zero total invocations).
	basicComputeTests->addChild(new EmptyWorkGroupCase(testCtx, "empty_workgroup_x", "Use an empty workgroup with size 0 on the X axis", tcu::UVec3(0u, 2u, 3u)));
	basicComputeTests->addChild(new EmptyWorkGroupCase(testCtx, "empty_workgroup_y", "Use an empty workgroup with size 0 on the Y axis", tcu::UVec3(2u, 0u, 3u)));
	basicComputeTests->addChild(new EmptyWorkGroupCase(testCtx, "empty_workgroup_z", "Use an empty workgroup with size 0 on the Z axis", tcu::UVec3(2u, 3u, 0u)));
	basicComputeTests->addChild(new EmptyWorkGroupCase(testCtx, "empty_workgroup_all", "Use an empty workgroup with size 0 on the X, Y and Z axes", tcu::UVec3(0u, 0u, 0u)));

	// Maximum local workgroup size on each axis (see MaxWorkGroupSizeTest above).
	basicComputeTests->addChild(new MaxWorkGroupSizeTest(testCtx, "max_local_size_x", "Use the maximum work group size on the X axis", MaxWorkGroupSizeTest::Params{MaxWorkGroupSizeTest::Axis::X}));
	basicComputeTests->addChild(new MaxWorkGroupSizeTest(testCtx, "max_local_size_y", "Use the maximum work group size on the Y axis", MaxWorkGroupSizeTest::Params{MaxWorkGroupSizeTest::Axis::Y}));
	basicComputeTests->addChild(new MaxWorkGroupSizeTest(testCtx, "max_local_size_z", "Use the maximum work group size on the Z axis", MaxWorkGroupSizeTest::Params{MaxWorkGroupSizeTest::Axis::Z}));

	// Buffer-to-buffer copies with bit inversion, varying workgroup counts and sizes.
	basicComputeTests->addChild(BufferToBufferInvertTest::UBOToSSBOInvertCase(testCtx,	"ubo_to_ssbo_single_invocation",	"Copy from UBO to SSBO, inverting bits",	256,	tcu::IVec3(1,1,1),	tcu::IVec3(1,1,1)));
	basicComputeTests->addChild(BufferToBufferInvertTest::UBOToSSBOInvertCase(testCtx,	"ubo_to_ssbo_single_group",			"Copy from UBO to SSBO, inverting bits",	1024,	tcu::IVec3(2,1,4),	tcu::IVec3(1,1,1)));
	basicComputeTests->addChild(BufferToBufferInvertTest::UBOToSSBOInvertCase(testCtx,	"ubo_to_ssbo_multiple_invocations",	"Copy from UBO to SSBO, inverting bits",	1024,	tcu::IVec3(1,1,1),	tcu::IVec3(2,4,1)));
	basicComputeTests->addChild(BufferToBufferInvertTest::UBOToSSBOInvertCase(testCtx,	"ubo_to_ssbo_multiple_groups",		"Copy from UBO to SSBO, inverting bits",	1024,	tcu::IVec3(1,4,2),	tcu::IVec3(2,2,4)));

	basicComputeTests->addChild(BufferToBufferInvertTest::CopyInvertSSBOCase(testCtx,	"copy_ssbo_single_invocation",		"Copy between SSBOs, inverting bits",	256,	tcu::IVec3(1,1,1),	tcu::IVec3(1,1,1)));
	basicComputeTests->addChild(BufferToBufferInvertTest::CopyInvertSSBOCase(testCtx,	"copy_ssbo_multiple_invocations",	"Copy between SSBOs, inverting bits",	1024,	tcu::IVec3(1,1,1),	tcu::IVec3(2,4,1)));
	basicComputeTests->addChild(BufferToBufferInvertTest::CopyInvertSSBOCase(testCtx,	"copy_ssbo_multiple_groups",		"Copy between SSBOs, inverting bits",	1024,	tcu::IVec3(1,4,2),	tcu::IVec3(2,2,4)));

	// In-place SSBO read/write, with sized and unsized arrays.
	basicComputeTests->addChild(new InvertSSBOInPlaceTest(testCtx,	"ssbo_rw_single_invocation",			"Read and write same SSBO",		256,	true,	tcu::IVec3(1,1,1),	tcu::IVec3(1,1,1)));
	basicComputeTests->addChild(new InvertSSBOInPlaceTest(testCtx,	"ssbo_rw_multiple_groups",				"Read and write same SSBO",		1024,	true,	tcu::IVec3(1,4,2),	tcu::IVec3(2,2,4)));
	basicComputeTests->addChild(new InvertSSBOInPlaceTest(testCtx,	"ssbo_unsized_arr_single_invocation",	"Read and write same SSBO",		256,	false,	tcu::IVec3(1,1,1),	tcu::IVec3(1,1,1)));
	basicComputeTests->addChild(new InvertSSBOInPlaceTest(testCtx,	"ssbo_unsized_arr_multiple_groups",		"Read and write same SSBO",		1024,	false,	tcu::IVec3(1,4,2),	tcu::IVec3(2,2,4)));

	// Writing to several SSBOs from one dispatch.
	basicComputeTests->addChild(new WriteToMultipleSSBOTest(testCtx,	"write_multiple_arr_single_invocation",			"Write to multiple SSBOs",	256,	true,	tcu::IVec3(1,1,1),	tcu::IVec3(1,1,1)));
	basicComputeTests->addChild(new WriteToMultipleSSBOTest(testCtx,	"write_multiple_arr_multiple_groups",			"Write to multiple SSBOs",	1024,	true,	tcu::IVec3(1,4,2),	tcu::IVec3(2,2,4)));
	basicComputeTests->addChild(new WriteToMultipleSSBOTest(testCtx,	"write_multiple_unsized_arr_single_invocation",	"Write to multiple SSBOs",	256,	false,	tcu::IVec3(1,1,1),	tcu::IVec3(1,1,1)));
	basicComputeTests->addChild(new WriteToMultipleSSBOTest(testCtx,	"write_multiple_unsized_arr_multiple_groups",	"Write to multiple SSBOs",	1024,	false,	tcu::IVec3(1,4,2),	tcu::IVec3(2,2,4)));

	// Workgroup-local (barrier()) and command-level (pipeline barrier) synchronization.
	basicComputeTests->addChild(new SSBOLocalBarrierTest(testCtx,	"ssbo_local_barrier_single_invocation",	"SSBO local barrier usage",	tcu::IVec3(1,1,1),	tcu::IVec3(1,1,1)));
	basicComputeTests->addChild(new SSBOLocalBarrierTest(testCtx,	"ssbo_local_barrier_single_group",		"SSBO local barrier usage",	tcu::IVec3(3,2,5),	tcu::IVec3(1,1,1)));
	basicComputeTests->addChild(new SSBOLocalBarrierTest(testCtx,	"ssbo_local_barrier_multiple_groups",	"SSBO local barrier usage",	tcu::IVec3(3,4,1),	tcu::IVec3(2,7,3)));

	basicComputeTests->addChild(new SSBOBarrierTest(testCtx,	"ssbo_cmd_barrier_single",		"SSBO memory barrier usage",	tcu::IVec3(1,1,1)));
	basicComputeTests->addChild(new SSBOBarrierTest(testCtx,	"ssbo_cmd_barrier_multiple",	"SSBO memory barrier usage",	tcu::IVec3(11,5,7)));

	// Shared variables and shared-variable atomics.
	basicComputeTests->addChild(new SharedVarTest(testCtx,	"shared_var_single_invocation",		"Basic shared variable usage",	tcu::IVec3(1,1,1),	tcu::IVec3(1,1,1)));
	basicComputeTests->addChild(new SharedVarTest(testCtx,	"shared_var_single_group",			"Basic shared variable usage",	tcu::IVec3(3,2,5),	tcu::IVec3(1,1,1)));
	basicComputeTests->addChild(new SharedVarTest(testCtx,	"shared_var_multiple_invocations",	"Basic shared variable usage",	tcu::IVec3(1,1,1),	tcu::IVec3(2,5,4)));
	basicComputeTests->addChild(new SharedVarTest(testCtx,	"shared_var_multiple_groups",		"Basic shared variable usage",	tcu::IVec3(3,4,1),	tcu::IVec3(2,7,3)));

	basicComputeTests->addChild(new SharedVarAtomicOpTest(testCtx,	"shared_atomic_op_single_invocation",		"Atomic operation with shared var",		tcu::IVec3(1,1,1),	tcu::IVec3(1,1,1)));
	basicComputeTests->addChild(new SharedVarAtomicOpTest(testCtx,	"shared_atomic_op_single_group",			"Atomic operation with shared var",		tcu::IVec3(3,2,5),	tcu::IVec3(1,1,1)));
	basicComputeTests->addChild(new SharedVarAtomicOpTest(testCtx,	"shared_atomic_op_multiple_invocations",	"Atomic operation with shared var",		tcu::IVec3(1,1,1),	tcu::IVec3(2,5,4)));
	basicComputeTests->addChild(new SharedVarAtomicOpTest(testCtx,	"shared_atomic_op_multiple_groups",			"Atomic operation with shared var",		tcu::IVec3(3,4,1),	tcu::IVec3(2,7,3)));

	// Image <-> SSBO copies, image atomics and image barriers.
	basicComputeTests->addChild(new CopyImageToSSBOTest(testCtx,	"copy_image_to_ssbo_small",	"Image to SSBO copy",	tcu::IVec2(1,1),	tcu::IVec2(64,64)));
	basicComputeTests->addChild(new CopyImageToSSBOTest(testCtx,	"copy_image_to_ssbo_large",	"Image to SSBO copy",	tcu::IVec2(2,4),	tcu::IVec2(512,512)));

	basicComputeTests->addChild(new CopySSBOToImageTest(testCtx,	"copy_ssbo_to_image_small",	"SSBO to image copy",	tcu::IVec2(1, 1),	tcu::IVec2(64, 64)));
	basicComputeTests->addChild(new CopySSBOToImageTest(testCtx,	"copy_ssbo_to_image_large",	"SSBO to image copy",	tcu::IVec2(2, 4),	tcu::IVec2(512, 512)));

	basicComputeTests->addChild(new ImageAtomicOpTest(testCtx,	"image_atomic_op_local_size_1",	"Atomic operation with image",	1,	tcu::IVec2(64,64)));
	basicComputeTests->addChild(new ImageAtomicOpTest(testCtx,	"image_atomic_op_local_size_8",	"Atomic operation with image",	8,	tcu::IVec2(64,64)));

	basicComputeTests->addChild(new ImageBarrierTest(testCtx,	"image_barrier_single",		"Image barrier",	tcu::IVec2(1,1)));
	basicComputeTests->addChild(new ImageBarrierTest(testCtx,	"image_barrier_multiple",	"Image barrier",	tcu::IVec2(64,64)));

	// Amber (external script) based case.
	basicComputeTests->addChild(cts_amber::createAmberTestCase(testCtx, "write_ssbo_array", "", "compute", "write_ssbo_array.amber"));

	return basicComputeTests.release();
}
3828
createBasicDeviceGroupComputeShaderTests(tcu::TestContext & testCtx)3829 tcu::TestCaseGroup* createBasicDeviceGroupComputeShaderTests (tcu::TestContext& testCtx)
3830 {
3831 de::MovePtr<tcu::TestCaseGroup> deviceGroupComputeTests(new tcu::TestCaseGroup(testCtx, "device_group", "Basic device group compute tests"));
3832
3833 deviceGroupComputeTests->addChild(new DispatchBaseTest(testCtx, "dispatch_base", "Compute shader with base groups", 32768, tcu::IVec3(4,2,4), tcu::IVec3(16,8,8), tcu::IVec3(4,8,8)));
3834 deviceGroupComputeTests->addChild(new DeviceIndexTest(testCtx, "device_index", "Compute shader using deviceIndex in SPIRV", 96, tcu::IVec3(3,2,1), tcu::IVec3(2,4,1)));
3835
3836 return deviceGroupComputeTests.release();
3837
3838 }
3839 } // compute
3840 } // vkt
3841