1 /*------------------------------------------------------------------------
2  * Vulkan Conformance Tests
3  * ------------------------
4  *
5  * Copyright (c) 2015-2024 The Khronos Group Inc.
6  * Copyright (c) 2017 Google Inc.
7  *
8  * Licensed under the Apache License, Version 2.0 (the "License");
9  * you may not use this file except in compliance with the License.
10  * You may obtain a copy of the License at
11  *
12  *      http://www.apache.org/licenses/LICENSE-2.0
13  *
14  * Unless required by applicable law or agreed to in writing, software
15  * distributed under the License is distributed on an "AS IS" BASIS,
16  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17  * See the License for the specific language governing permissions and
18  * limitations under the License.
19  *
20  *//*!
21  * \file
22  * \brief Atomic operations (OpAtomic*) tests.
23  *//*--------------------------------------------------------------------*/
24 
25 #include "vktAtomicOperationTests.hpp"
26 #include "vktShaderExecutor.hpp"
27 
28 #include "vkRefUtil.hpp"
29 #include "vkMemUtil.hpp"
30 #include "vkQueryUtil.hpp"
31 #include "vkObjUtil.hpp"
32 #include "vkBarrierUtil.hpp"
33 #include "vkCmdUtil.hpp"
34 #include "vktTestGroupUtil.hpp"
35 
36 #include "tcuTestLog.hpp"
37 #include "tcuStringTemplate.hpp"
38 #include "tcuResultCollector.hpp"
39 
40 #include "deFloat16.h"
41 #include "deMath.hpp"
42 #include "deStringUtil.hpp"
43 #include "deSharedPtr.hpp"
44 #include "deRandom.hpp"
45 #include "deArrayUtil.hpp"
46 
47 #include <string>
48 #include <memory>
49 #include <cmath>
50 
51 namespace vkt
52 {
53 namespace shaderexecutor
54 {
55 
56 namespace
57 {
58 
59 using de::MovePtr;
60 using de::UniquePtr;
61 using std::vector;
62 
63 using namespace vk;
64 
// Kind of memory the atomic operations of a test case operate on.
enum class AtomicMemoryType
{
    BUFFER = 0, // Normal buffer.
    SHARED,     // Shared global struct in a compute workgroup.
    REFERENCE,  // Buffer passed as a reference.
    PAYLOAD,    // Task payload.
};
72 
73 // Helper struct to indicate the shader type and if it should use shared global memory.
74 class AtomicShaderType
75 {
76 public:
AtomicShaderType(glu::ShaderType type,AtomicMemoryType memoryType)77     AtomicShaderType(glu::ShaderType type, AtomicMemoryType memoryType) : m_type(type), m_atomicMemoryType(memoryType)
78     {
79         // Shared global memory can only be set to true with compute, task and mesh shaders.
80         DE_ASSERT(memoryType != AtomicMemoryType::SHARED || type == glu::SHADERTYPE_COMPUTE ||
81                   type == glu::SHADERTYPE_TASK || type == glu::SHADERTYPE_MESH);
82 
83         // Task payload memory can only be tested in task shaders.
84         DE_ASSERT(memoryType != AtomicMemoryType::PAYLOAD || type == glu::SHADERTYPE_TASK);
85     }
86 
getType(void) const87     glu::ShaderType getType(void) const
88     {
89         return m_type;
90     }
getMemoryType(void) const91     AtomicMemoryType getMemoryType(void) const
92     {
93         return m_atomicMemoryType;
94     }
isSharedLike(void) const95     bool isSharedLike(void) const
96     {
97         return m_atomicMemoryType == AtomicMemoryType::SHARED || m_atomicMemoryType == AtomicMemoryType::PAYLOAD;
98     }
isMeshShadingStage(void) const99     bool isMeshShadingStage(void) const
100     {
101         return (m_type == glu::SHADERTYPE_TASK || m_type == glu::SHADERTYPE_MESH);
102     }
103 
104 private:
105     glu::ShaderType m_type;
106     AtomicMemoryType m_atomicMemoryType;
107 };
108 
109 // Buffer helper
110 class Buffer
111 {
112 public:
113     Buffer(Context &context, VkBufferUsageFlags usage, size_t size, bool useRef);
114 
getBuffer(void) const115     VkBuffer getBuffer(void) const
116     {
117         return *m_buffer;
118     }
getHostPtr(void) const119     void *getHostPtr(void) const
120     {
121         return m_allocation->getHostPtr();
122     }
123     void flush(void);
124     void invalidate(void);
125 
126 private:
127     const DeviceInterface &m_vkd;
128     const VkDevice m_device;
129     const VkQueue m_queue;
130     const uint32_t m_queueIndex;
131     const Unique<VkBuffer> m_buffer;
132     const UniquePtr<Allocation> m_allocation;
133 };
134 
135 typedef de::SharedPtr<Buffer> BufferSp;
136 
createBuffer(const DeviceInterface & vkd,VkDevice device,VkDeviceSize size,VkBufferUsageFlags usageFlags)137 Move<VkBuffer> createBuffer(const DeviceInterface &vkd, VkDevice device, VkDeviceSize size,
138                             VkBufferUsageFlags usageFlags)
139 {
140     const VkBufferCreateInfo createInfo = {VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
141                                            nullptr,
142                                            (VkBufferCreateFlags)0,
143                                            size,
144                                            usageFlags,
145                                            VK_SHARING_MODE_EXCLUSIVE,
146                                            0u,
147                                            nullptr};
148     return createBuffer(vkd, device, &createInfo);
149 }
150 
allocateAndBindMemory(const DeviceInterface & vkd,VkDevice device,Allocator & allocator,VkBuffer buffer,bool useRef)151 MovePtr<Allocation> allocateAndBindMemory(const DeviceInterface &vkd, VkDevice device, Allocator &allocator,
152                                           VkBuffer buffer, bool useRef)
153 {
154     const MemoryRequirement allocationType =
155         (MemoryRequirement::HostVisible | (useRef ? MemoryRequirement::DeviceAddress : MemoryRequirement::Any));
156     MovePtr<Allocation> alloc(allocator.allocate(getBufferMemoryRequirements(vkd, device, buffer), allocationType));
157 
158     VK_CHECK(vkd.bindBufferMemory(device, buffer, alloc->getMemory(), alloc->getOffset()));
159 
160     return alloc;
161 }
162 
Buffer(Context & context,VkBufferUsageFlags usage,size_t size,bool useRef)163 Buffer::Buffer(Context &context, VkBufferUsageFlags usage, size_t size, bool useRef)
164     : m_vkd(context.getDeviceInterface())
165     , m_device(context.getDevice())
166     , m_queue(context.getUniversalQueue())
167     , m_queueIndex(context.getUniversalQueueFamilyIndex())
168     , m_buffer(createBuffer(context.getDeviceInterface(), context.getDevice(), (VkDeviceSize)size, usage))
169     , m_allocation(allocateAndBindMemory(context.getDeviceInterface(), context.getDevice(),
170                                          context.getDefaultAllocator(), *m_buffer, useRef))
171 {
172 }
173 
flush(void)174 void Buffer::flush(void)
175 {
176     flushMappedMemoryRange(m_vkd, m_device, m_allocation->getMemory(), m_allocation->getOffset(), VK_WHOLE_SIZE);
177 }
178 
invalidate(void)179 void Buffer::invalidate(void)
180 {
181     const auto cmdPool = vk::makeCommandPool(m_vkd, m_device, m_queueIndex);
182     const auto cmdBufferPtr =
183         vk::allocateCommandBuffer(m_vkd, m_device, cmdPool.get(), VK_COMMAND_BUFFER_LEVEL_PRIMARY);
184     const auto cmdBuffer     = cmdBufferPtr.get();
185     const auto bufferBarrier = vk::makeBufferMemoryBarrier(VK_ACCESS_MEMORY_WRITE_BIT, VK_ACCESS_HOST_READ_BIT,
186                                                            m_buffer.get(), 0ull, VK_WHOLE_SIZE);
187 
188     beginCommandBuffer(m_vkd, cmdBuffer);
189     m_vkd.cmdPipelineBarrier(cmdBuffer, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_HOST_BIT, 0u, 0u, nullptr,
190                              1u, &bufferBarrier, 0u, nullptr);
191     endCommandBuffer(m_vkd, cmdBuffer);
192     submitCommandsAndWait(m_vkd, m_device, m_queue, cmdBuffer);
193 
194     invalidateMappedMemoryRange(m_vkd, m_device, m_allocation->getMemory(), m_allocation->getOffset(), VK_WHOLE_SIZE);
195 }
196 
197 // Tests
198 
// Atomic operations covered by the tests. The enumerators are used as indices into
// the name table of atomicOp2Str(), so their order must match that table.
enum AtomicOperation
{
    ATOMIC_OP_EXCHANGE = 0,
    ATOMIC_OP_COMP_SWAP,
    ATOMIC_OP_ADD,
    ATOMIC_OP_MIN,
    ATOMIC_OP_MAX,
    ATOMIC_OP_AND,
    ATOMIC_OP_OR,
    ATOMIC_OP_XOR,

    ATOMIC_OP_LAST // Number of operations; not an operation itself.
};
212 
atomicOp2Str(AtomicOperation op)213 std::string atomicOp2Str(AtomicOperation op)
214 {
215     static const char *const s_names[] = {"atomicExchange", "atomicCompSwap", "atomicAdd", "atomicMin",
216                                           "atomicMax",      "atomicAnd",      "atomicOr",  "atomicXor"};
217     return de::getSizedArrayElement<ATOMIC_OP_LAST>(s_names, op);
218 }
219 
// Number of input/output elements in a test buffer. The inout arrays of the test
// buffers hold NUM_ELEMENTS / 2 elements.
enum
{
    NUM_ELEMENTS = 32
};
224 
// Data types the atomic operations are tested with. The enumerators are used as
// indices into the name table of dataType2Str(), so their order must match that table.
enum DataType
{
    DATA_TYPE_FLOAT16 = 0,
    DATA_TYPE_FLOAT16X2,
    DATA_TYPE_FLOAT16X4,
    DATA_TYPE_INT32,
    DATA_TYPE_UINT32,
    DATA_TYPE_FLOAT32,
    DATA_TYPE_INT64,
    DATA_TYPE_UINT64,
    DATA_TYPE_FLOAT64,

    DATA_TYPE_LAST // Number of data types; not a data type itself.
};
239 
dataType2Str(DataType type)240 std::string dataType2Str(DataType type)
241 {
242     static const char *const s_names[] = {
243         "float16_t", "f16vec2", "f16vec4", "int", "uint", "float", "int64_t", "uint64_t", "double",
244     };
245     return de::getSizedArrayElement<DATA_TYPE_LAST>(s_names, type);
246 }
247 
248 class BufferInterface
249 {
250 public:
251     virtual void setBuffer(void *ptr) = 0;
252 
253     virtual size_t bufferSize() = 0;
254 
255     virtual void fillWithTestData(de::Random &rnd) = 0;
256 
257     virtual void checkResults(tcu::ResultCollector &resultCollector) = 0;
258 
~BufferInterface()259     virtual ~BufferInterface()
260     {
261     }
262 };
263 
264 template <typename dataTypeT>
265 class TestBuffer : public BufferInterface
266 {
267 public:
TestBuffer(AtomicOperation atomicOp)268     TestBuffer(AtomicOperation atomicOp) : m_atomicOp(atomicOp)
269     {
270     }
271 
272     template <typename T>
273     struct BufferData
274     {
275         // Use half the number of elements for inout to cause overlap between atomic operations.
276         // Each inout element at index i will have two atomic operations using input from
277         // indices i and i + NUM_ELEMENTS / 2.
278         T inout[NUM_ELEMENTS / 2];
279         T input[NUM_ELEMENTS];
280         T compare[NUM_ELEMENTS];
281         T output[NUM_ELEMENTS];
282         T invocationHitCount[NUM_ELEMENTS];
283         int32_t index;
284     };
285 
setBuffer(void * ptr)286     virtual void setBuffer(void *ptr)
287     {
288         m_ptr = static_cast<BufferData<dataTypeT> *>(ptr);
289     }
290 
bufferSize()291     virtual size_t bufferSize()
292     {
293         return sizeof(BufferData<dataTypeT>);
294     }
295 
fillWithTestData(de::Random & rnd)296     virtual void fillWithTestData(de::Random &rnd)
297     {
298         dataTypeT pattern;
299         deMemset(&pattern, 0xcd, sizeof(dataTypeT));
300 
301         for (int i = 0; i < NUM_ELEMENTS / 2; i++)
302         {
303             m_ptr->inout[i] = static_cast<dataTypeT>(rnd.getUint64());
304             // The first half of compare elements match with every even index.
305             // The second half matches with odd indices. This causes the
306             // overlapping operations to only select one.
307             m_ptr->compare[i]                    = m_ptr->inout[i] + (i % 2);
308             m_ptr->compare[i + NUM_ELEMENTS / 2] = m_ptr->inout[i] + 1 - (i % 2);
309         }
310         for (int i = 0; i < NUM_ELEMENTS; i++)
311         {
312             m_ptr->input[i]              = static_cast<dataTypeT>(rnd.getUint64());
313             m_ptr->output[i]             = pattern;
314             m_ptr->invocationHitCount[i] = 0;
315         }
316         m_ptr->index = 0;
317 
318         // Take a copy to be used when calculating expected values.
319         m_original = *m_ptr;
320     }
321 
checkResults(tcu::ResultCollector & resultCollector)322     virtual void checkResults(tcu::ResultCollector &resultCollector)
323     {
324         checkOperation(m_original, *m_ptr, resultCollector);
325     }
326 
327     template <typename T>
328     struct Expected
329     {
330         T m_inout;
331         T m_output[2];
332 
Expectedvkt::shaderexecutor::__anonc9d8e1400111::TestBuffer::Expected333         Expected(T inout, T output0, T output1) : m_inout(inout)
334         {
335             m_output[0] = output0;
336             m_output[1] = output1;
337         }
338 
comparevkt::shaderexecutor::__anonc9d8e1400111::TestBuffer::Expected339         bool compare(T inout, T output0, T output1)
340         {
341             return (deMemCmp((const void *)&m_inout, (const void *)&inout, sizeof(inout)) == 0 &&
342                     deMemCmp((const void *)&m_output[0], (const void *)&output0, sizeof(output0)) == 0 &&
343                     deMemCmp((const void *)&m_output[1], (const void *)&output1, sizeof(output1)) == 0);
344         }
345     };
346 
347     void checkOperation(const BufferData<dataTypeT> &original, const BufferData<dataTypeT> &result,
348                         tcu::ResultCollector &resultCollector);
349 
350     const AtomicOperation m_atomicOp;
351 
352     BufferData<dataTypeT> *m_ptr;
353     BufferData<dataTypeT> m_original;
354 };
355 
// Approximate equality: true when the values, converted to double, differ by less than 1e-5.
template <typename T>
bool sloppyFPCompare(T x, T y)
{
    const double difference = deToDouble(x) - deToDouble(y);
    return std::fabs(difference) < 0.00001;
}
361 
362 template <>
sloppyFPCompare(deFloat16 x,deFloat16 y)363 bool sloppyFPCompare<deFloat16>(deFloat16 x, deFloat16 y)
364 {
365     return fabs(deToDouble(x) - deToDouble(y)) < 0.01;
366 }
367 
// Approximate equality that treats any two NaNs as equal and a NaN as different
// from every non-NaN value.
template <typename T>
bool nanSafeSloppyEquals(T x, T y)
{
    const bool xIsNaN = deIsIEEENaN(x);
    const bool yIsNaN = deIsIEEENaN(y);

    if (xIsNaN || yIsNaN)
        return xIsNaN && yIsNaN;

    return sloppyFPCompare(x, y);
}
379 
380 template <typename dataTypeT, uint32_t VecSize = 1>
381 class TestBufferFloatingPoint : public BufferInterface
382 {
383 public:
TestBufferFloatingPoint(AtomicOperation atomicOp)384     TestBufferFloatingPoint(AtomicOperation atomicOp) : m_atomicOp(atomicOp)
385     {
386     }
387 
388     template <typename T, uint32_t VecSize2>
389     struct BufferDataFloatingPoint
390     {
391         // Use half the number of elements for inout to cause overlap between atomic operations.
392         // Each inout element at index i will have two atomic operations using input from
393         // indices i and i + NUM_ELEMENTS / 2.
394         T inout[NUM_ELEMENTS / 2 * VecSize2];
395         T input[NUM_ELEMENTS * VecSize2];
396         T compare[NUM_ELEMENTS * VecSize2];
397         T output[NUM_ELEMENTS * VecSize2];
398         int32_t invocationHitCount[NUM_ELEMENTS];
399         int32_t index;
400     };
401 
setBuffer(void * ptr)402     virtual void setBuffer(void *ptr)
403     {
404         m_ptr = static_cast<BufferDataFloatingPoint<dataTypeT, VecSize> *>(ptr);
405     }
406 
bufferSize()407     virtual size_t bufferSize()
408     {
409         return sizeof(BufferDataFloatingPoint<dataTypeT, VecSize>);
410     }
411 
fillWithTestData(de::Random & rnd)412     virtual void fillWithTestData(de::Random &rnd)
413     {
414         dataTypeT pattern;
415         deMemset(&pattern, 0xcd, sizeof(dataTypeT));
416 
417         for (uint32_t i = 0; i < (NUM_ELEMENTS / 2) * VecSize; i++)
418         {
419             m_ptr->inout[i] = deToFloatType<dataTypeT>(rnd.getFloat());
420         }
421         for (uint32_t i = 0; i < NUM_ELEMENTS * VecSize; i++)
422         {
423             m_ptr->input[i]  = deToFloatType<dataTypeT>(rnd.getFloat());
424             m_ptr->output[i] = pattern;
425             // These aren't used by any of the float tests
426             m_ptr->compare[i] = deToFloatType<dataTypeT>(0.0);
427         }
428         for (int i = 0; i < NUM_ELEMENTS; i++)
429         {
430             m_ptr->invocationHitCount[i] = 0;
431         }
432         // Add special cases for NaN and +/-0
433         // 0: min(sNaN, x)
434         m_ptr->inout[0] = deSignalingNaN<dataTypeT>();
435         // 1: min(x, sNaN)
436         m_ptr->input[1 * 2 + 0] = deSignalingNaN<dataTypeT>();
437         // 2: min(qNaN, x)
438         m_ptr->inout[2] = deQuietNaN<dataTypeT>();
439         // 3: min(x, qNaN)
440         m_ptr->input[3 * 2 + 0] = deQuietNaN<dataTypeT>();
441         // 4: min(NaN, NaN)
442         m_ptr->inout[4]         = deSignalingNaN<dataTypeT>();
443         m_ptr->input[4 * 2 + 0] = deQuietNaN<dataTypeT>();
444         m_ptr->input[4 * 2 + 1] = deQuietNaN<dataTypeT>();
445         // 5: min(+0, -0)
446         m_ptr->inout[5]         = deToFloatType<dataTypeT>(-0.0);
447         m_ptr->input[5 * 2 + 0] = deToFloatType<dataTypeT>(0.0);
448         m_ptr->input[5 * 2 + 1] = deToFloatType<dataTypeT>(0.0);
449 
450         m_ptr->index = 0;
451 
452         // Take a copy to be used when calculating expected values.
453         m_original = *m_ptr;
454     }
455 
checkResults(tcu::ResultCollector & resultCollector)456     virtual void checkResults(tcu::ResultCollector &resultCollector)
457     {
458         checkOperationFloatingPoint(m_original, *m_ptr, resultCollector);
459     }
460 
461     template <typename T>
462     struct Expected
463     {
464         T m_inout;
465         T m_output[2];
466 
Expectedvkt::shaderexecutor::__anonc9d8e1400111::TestBufferFloatingPoint::Expected467         Expected(T inout, T output0, T output1) : m_inout(inout)
468         {
469             m_output[0] = output0;
470             m_output[1] = output1;
471         }
472 
comparevkt::shaderexecutor::__anonc9d8e1400111::TestBufferFloatingPoint::Expected473         bool compare(T inout, T output0, T output1)
474         {
475             return nanSafeSloppyEquals(m_inout, inout) && nanSafeSloppyEquals(m_output[0], output0) &&
476                    nanSafeSloppyEquals(m_output[1], output1);
477         }
478     };
479 
480     void checkOperationFloatingPoint(const BufferDataFloatingPoint<dataTypeT, VecSize> &original,
481                                      const BufferDataFloatingPoint<dataTypeT, VecSize> &result,
482                                      tcu::ResultCollector &resultCollector);
483 
484     const AtomicOperation m_atomicOp;
485 
486     BufferDataFloatingPoint<dataTypeT, VecSize> *m_ptr;
487     BufferDataFloatingPoint<dataTypeT, VecSize> m_original;
488 };
489 
createTestBuffer(DataType type,AtomicOperation atomicOp)490 static BufferInterface *createTestBuffer(DataType type, AtomicOperation atomicOp)
491 {
492     switch (type)
493     {
494     case DATA_TYPE_FLOAT16:
495         return new TestBufferFloatingPoint<deFloat16>(atomicOp);
496     case DATA_TYPE_FLOAT16X2:
497         return new TestBufferFloatingPoint<deFloat16, 2>(atomicOp);
498     case DATA_TYPE_FLOAT16X4:
499         return new TestBufferFloatingPoint<deFloat16, 4>(atomicOp);
500     case DATA_TYPE_INT32:
501         return new TestBuffer<int32_t>(atomicOp);
502     case DATA_TYPE_UINT32:
503         return new TestBuffer<uint32_t>(atomicOp);
504     case DATA_TYPE_FLOAT32:
505         return new TestBufferFloatingPoint<float>(atomicOp);
506     case DATA_TYPE_INT64:
507         return new TestBuffer<int64_t>(atomicOp);
508     case DATA_TYPE_UINT64:
509         return new TestBuffer<uint64_t>(atomicOp);
510     case DATA_TYPE_FLOAT64:
511         return new TestBufferFloatingPoint<double>(atomicOp);
512     default:
513         DE_ASSERT(false);
514         return nullptr;
515     }
516 }
517 
518 // Use template to handle both signed and unsigned cases. SPIR-V should
519 // have separate operations for both.
520 template <typename T>
checkOperation(const BufferData<T> & original,const BufferData<T> & result,tcu::ResultCollector & resultCollector)521 void TestBuffer<T>::checkOperation(const BufferData<T> &original, const BufferData<T> &result,
522                                    tcu::ResultCollector &resultCollector)
523 {
524     // originalInout = original inout
525     // input0 = input at index i
526     // iinput1 = input at index i + NUM_ELEMENTS / 2
527     //
528     // atomic operation will return the memory contents before
529     // the operation and this is stored as output. Two operations
530     // are executed for each InOut value (using input0 and input1).
531     //
532     // Since there is an overlap of two operations per each
533     // InOut element, the outcome of the resulting InOut and
534     // the outputs of the operations have two result candidates
535     // depending on the execution order. Verification passes
536     // if the results match one of these options.
537 
538     for (int elementNdx = 0; elementNdx < NUM_ELEMENTS / 2; elementNdx++)
539     {
540         // Needed when reinterpeting the data as signed values.
541         const T originalInout = *reinterpret_cast<const T *>(&original.inout[elementNdx]);
542         const T input0        = *reinterpret_cast<const T *>(&original.input[elementNdx]);
543         const T input1        = *reinterpret_cast<const T *>(&original.input[elementNdx + NUM_ELEMENTS / 2]);
544 
545         // Expected results are collected to this vector.
546         vector<Expected<T>> exp;
547 
548         switch (m_atomicOp)
549         {
550         case ATOMIC_OP_ADD:
551         {
552             exp.push_back(Expected<T>(originalInout + input0 + input1, originalInout, originalInout + input0));
553             exp.push_back(Expected<T>(originalInout + input0 + input1, originalInout + input1, originalInout));
554         }
555         break;
556 
557         case ATOMIC_OP_AND:
558         {
559             exp.push_back(Expected<T>(originalInout & input0 & input1, originalInout, originalInout & input0));
560             exp.push_back(Expected<T>(originalInout & input0 & input1, originalInout & input1, originalInout));
561         }
562         break;
563 
564         case ATOMIC_OP_OR:
565         {
566             exp.push_back(Expected<T>(originalInout | input0 | input1, originalInout, originalInout | input0));
567             exp.push_back(Expected<T>(originalInout | input0 | input1, originalInout | input1, originalInout));
568         }
569         break;
570 
571         case ATOMIC_OP_XOR:
572         {
573             exp.push_back(Expected<T>(originalInout ^ input0 ^ input1, originalInout, originalInout ^ input0));
574             exp.push_back(Expected<T>(originalInout ^ input0 ^ input1, originalInout ^ input1, originalInout));
575         }
576         break;
577 
578         case ATOMIC_OP_MIN:
579         {
580             exp.push_back(Expected<T>(de::min(de::min(originalInout, input0), input1), originalInout,
581                                       de::min(originalInout, input0)));
582             exp.push_back(Expected<T>(de::min(de::min(originalInout, input0), input1), de::min(originalInout, input1),
583                                       originalInout));
584         }
585         break;
586 
587         case ATOMIC_OP_MAX:
588         {
589             exp.push_back(Expected<T>(de::max(de::max(originalInout, input0), input1), originalInout,
590                                       de::max(originalInout, input0)));
591             exp.push_back(Expected<T>(de::max(de::max(originalInout, input0), input1), de::max(originalInout, input1),
592                                       originalInout));
593         }
594         break;
595 
596         case ATOMIC_OP_EXCHANGE:
597         {
598             exp.push_back(Expected<T>(input1, originalInout, input0));
599             exp.push_back(Expected<T>(input0, input1, originalInout));
600         }
601         break;
602 
603         case ATOMIC_OP_COMP_SWAP:
604         {
605             if (elementNdx % 2 == 0)
606             {
607                 exp.push_back(Expected<T>(input0, originalInout, input0));
608                 exp.push_back(Expected<T>(input0, originalInout, originalInout));
609             }
610             else
611             {
612                 exp.push_back(Expected<T>(input1, input1, originalInout));
613                 exp.push_back(Expected<T>(input1, originalInout, originalInout));
614             }
615         }
616         break;
617 
618         default:
619             DE_FATAL("Unexpected atomic operation.");
620             break;
621         }
622 
623         const T resIo      = result.inout[elementNdx];
624         const T resOutput0 = result.output[elementNdx];
625         const T resOutput1 = result.output[elementNdx + NUM_ELEMENTS / 2];
626 
627         if (!exp[0].compare(resIo, resOutput0, resOutput1) && !exp[1].compare(resIo, resOutput0, resOutput1))
628         {
629             std::ostringstream errorMessage;
630             errorMessage << "ERROR: Result value check failed at index " << elementNdx
631                          << ". Expected one of the two outcomes: InOut = " << tcu::toHex(exp[0].m_inout)
632                          << ", Output0 = " << tcu::toHex(exp[0].m_output[0])
633                          << ", Output1 = " << tcu::toHex(exp[0].m_output[1])
634                          << ", or InOut = " << tcu::toHex(exp[1].m_inout)
635                          << ", Output0 = " << tcu::toHex(exp[1].m_output[0])
636                          << ", Output1 = " << tcu::toHex(exp[1].m_output[1]) << ". Got: InOut = " << tcu::toHex(resIo)
637                          << ", Output0 = " << tcu::toHex(resOutput0) << ", Output1 = " << tcu::toHex(resOutput1)
638                          << ". Using Input0 = " << tcu::toHex(original.input[elementNdx])
639                          << " and Input1 = " << tcu::toHex(original.input[elementNdx + NUM_ELEMENTS / 2]) << ".";
640 
641             resultCollector.fail(errorMessage.str());
642         }
643     }
644 }
645 
646 template <typename T>
handleExceptionalFloatMinMaxValues(vector<T> & values,T x,T y)647 void handleExceptionalFloatMinMaxValues(vector<T> &values, T x, T y)
648 {
649 
650     if (deIsSignalingNaN(x) && deIsSignalingNaN(y))
651     {
652         values.push_back(deQuietNaN<T>());
653         values.push_back(deSignalingNaN<T>());
654     }
655     else if (deIsSignalingNaN(x))
656     {
657         values.push_back(deQuietNaN<T>());
658         values.push_back(deSignalingNaN<T>());
659         if (!deIsIEEENaN(y))
660             values.push_back(y);
661     }
662     else if (deIsSignalingNaN(y))
663     {
664         values.push_back(deQuietNaN<T>());
665         values.push_back(deSignalingNaN<T>());
666         if (!deIsIEEENaN(x))
667             values.push_back(x);
668     }
669     else if (deIsIEEENaN(x) && deIsIEEENaN(y))
670     {
671         // Both quiet NaNs
672         values.push_back(deQuietNaN<T>());
673     }
674     else if (deIsIEEENaN(x))
675     {
676         // One quiet NaN and one non-NaN.
677         values.push_back(y);
678     }
679     else if (deIsIEEENaN(y))
680     {
681         // One quiet NaN and one non-NaN.
682         values.push_back(x);
683     }
684     else if ((deIsPositiveZero(x) && deIsNegativeZero(y)) || (deIsNegativeZero(x) && deIsPositiveZero(y)))
685     {
686         values.push_back(deToFloatType<T>(0.0));
687         values.push_back(deToFloatType<T>(-0.0));
688     }
689 }
690 
691 template <typename T>
floatAdd(T x,T y)692 T floatAdd(T x, T y)
693 {
694     if (deIsIEEENaN(x) || deIsIEEENaN(y))
695         return deQuietNaN<T>();
696     return deToFloatType<T>(deToDouble(x) + deToDouble(y));
697 }
698 
699 template <typename T>
floatMinValues(T x,T y)700 vector<T> floatMinValues(T x, T y)
701 {
702     vector<T> values;
703     handleExceptionalFloatMinMaxValues(values, x, y);
704     if (values.empty())
705     {
706         values.push_back(deToDouble(x) < deToDouble(y) ? x : y);
707     }
708     return values;
709 }
710 
711 template <typename T>
floatMaxValues(T x,T y)712 vector<T> floatMaxValues(T x, T y)
713 {
714     vector<T> values;
715     handleExceptionalFloatMinMaxValues(values, x, y);
716     if (values.empty())
717     {
718         values.push_back(deToDouble(x) > deToDouble(y) ? x : y);
719     }
720     return values;
721 }
722 
723 // Use template to handle both float and double cases. SPIR-V should
724 // have separate operations for both.
725 template <typename T, uint32_t VecSize>
checkOperationFloatingPoint(const BufferDataFloatingPoint<T,VecSize> & original,const BufferDataFloatingPoint<T,VecSize> & result,tcu::ResultCollector & resultCollector)726 void TestBufferFloatingPoint<T, VecSize>::checkOperationFloatingPoint(
727     const BufferDataFloatingPoint<T, VecSize> &original, const BufferDataFloatingPoint<T, VecSize> &result,
728     tcu::ResultCollector &resultCollector)
729 {
730     // originalInout = original inout
731     // input0 = input at index i
732     // iinput1 = input at index i + NUM_ELEMENTS / 2
733     //
734     // atomic operation will return the memory contents before
735     // the operation and this is stored as output. Two operations
736     // are executed for each InOut value (using input0 and input1).
737     //
738     // Since there is an overlap of two operations per each
739     // InOut element, the outcome of the resulting InOut and
740     // the outputs of the operations have two result candidates
741     // depending on the execution order. Verification passes
742     // if the results match one of these options.
743 
744     for (int elementNdx = 0; elementNdx < NUM_ELEMENTS / 2; elementNdx++)
745     {
746         for (uint32_t vecIdx = 0; vecIdx < VecSize; ++vecIdx)
747         {
748             // Needed when reinterpeting the data as signed values.
749             const T originalInout = *reinterpret_cast<const T *>(&original.inout[elementNdx * VecSize + vecIdx]);
750             const T input0        = *reinterpret_cast<const T *>(&original.input[elementNdx * VecSize + vecIdx]);
751             const T input1 =
752                 *reinterpret_cast<const T *>(&original.input[(elementNdx + NUM_ELEMENTS / 2) * VecSize + vecIdx]);
753 
754             // Expected results are collected to this vector.
755             vector<Expected<T>> exp;
756 
757             switch (m_atomicOp)
758             {
759             case ATOMIC_OP_ADD:
760             {
761                 exp.push_back(Expected<T>(floatAdd(floatAdd(originalInout, input0), input1), originalInout,
762                                           floatAdd(originalInout, input0)));
763                 exp.push_back(Expected<T>(floatAdd(floatAdd(originalInout, input1), input0),
764                                           floatAdd(originalInout, input1), originalInout));
765             }
766             break;
767 
768             case ATOMIC_OP_MIN:
769             {
770                 // The case where input0 is combined first
771                 vector<T> minOriginalAndInput0 = floatMinValues(originalInout, input0);
772                 for (T x : minOriginalAndInput0)
773                 {
774                     vector<T> minAll = floatMinValues(x, input1);
775                     for (T y : minAll)
776                     {
777                         exp.push_back(Expected<T>(y, originalInout, x));
778                     }
779                 }
780 
781                 // The case where input1 is combined first
782                 vector<T> minOriginalAndInput1 = floatMinValues(originalInout, input1);
783                 for (T x : minOriginalAndInput1)
784                 {
785                     vector<T> minAll = floatMinValues(x, input0);
786                     for (T y : minAll)
787                     {
788                         exp.push_back(Expected<T>(y, x, originalInout));
789                     }
790                 }
791             }
792             break;
793 
794             case ATOMIC_OP_MAX:
795             {
796                 // The case where input0 is combined first
797                 vector<T> minOriginalAndInput0 = floatMaxValues(originalInout, input0);
798                 for (T x : minOriginalAndInput0)
799                 {
800                     vector<T> minAll = floatMaxValues(x, input1);
801                     for (T y : minAll)
802                     {
803                         exp.push_back(Expected<T>(y, originalInout, x));
804                     }
805                 }
806 
807                 // The case where input1 is combined first
808                 vector<T> minOriginalAndInput1 = floatMaxValues(originalInout, input1);
809                 for (T x : minOriginalAndInput1)
810                 {
811                     vector<T> minAll = floatMaxValues(x, input0);
812                     for (T y : minAll)
813                     {
814                         exp.push_back(Expected<T>(y, x, originalInout));
815                     }
816                 }
817             }
818             break;
819 
820             case ATOMIC_OP_EXCHANGE:
821             {
822                 exp.push_back(Expected<T>(input1, originalInout, input0));
823                 exp.push_back(Expected<T>(input0, input1, originalInout));
824             }
825             break;
826 
827             default:
828                 DE_FATAL("Unexpected atomic operation.");
829                 break;
830             }
831 
832             const T resIo      = result.inout[elementNdx * VecSize + vecIdx];
833             const T resOutput0 = result.output[elementNdx * VecSize + vecIdx];
834             const T resOutput1 = result.output[(elementNdx + NUM_ELEMENTS / 2) * VecSize + vecIdx];
835 
836             bool hasMatch = false;
837             for (Expected<T> e : exp)
838             {
839                 if (e.compare(resIo, resOutput0, resOutput1))
840                 {
841                     hasMatch = true;
842                     break;
843                 }
844             }
845             if (!hasMatch)
846             {
847                 std::ostringstream errorMessage;
848                 errorMessage << "ERROR: Result value check failed at index (" << elementNdx << ", " << vecIdx << ")"
849                              << ". Expected one of the outcomes:";
850 
851                 bool first = true;
852                 for (Expected<T> e : exp)
853                 {
854                     if (!first)
855                         errorMessage << ", or";
856                     first = false;
857 
858                     errorMessage << " InOut = " << e.m_inout << ", Output0 = " << e.m_output[0]
859                                  << ", Output1 = " << e.m_output[1];
860                 }
861 
862                 errorMessage << ". Got: InOut = " << resIo << ", Output0 = " << resOutput0
863                              << ", Output1 = " << resOutput1
864                              << ". Using Input0 = " << original.input[elementNdx * VecSize + vecIdx]
865                              << " and Input1 = " << original.input[(elementNdx + NUM_ELEMENTS / 2) * VecSize + vecIdx]
866                              << ".";
867 
868                 resultCollector.fail(errorMessage.str());
869             }
870         }
871     }
872 }
873 
874 class AtomicOperationCaseInstance : public TestInstance
875 {
876 public:
877     AtomicOperationCaseInstance(Context &context, const ShaderSpec &shaderSpec, AtomicShaderType shaderType,
878                                 DataType dataType, AtomicOperation atomicOp);
879 
880     virtual tcu::TestStatus iterate(void);
881 
882 private:
883     const ShaderSpec &m_shaderSpec;
884     AtomicShaderType m_shaderType;
885     const DataType m_dataType;
886     AtomicOperation m_atomicOp;
887 };
888 
AtomicOperationCaseInstance(Context & context,const ShaderSpec & shaderSpec,AtomicShaderType shaderType,DataType dataType,AtomicOperation atomicOp)889 AtomicOperationCaseInstance::AtomicOperationCaseInstance(Context &context, const ShaderSpec &shaderSpec,
890                                                          AtomicShaderType shaderType, DataType dataType,
891                                                          AtomicOperation atomicOp)
892     : TestInstance(context)
893     , m_shaderSpec(shaderSpec)
894     , m_shaderType(shaderType)
895     , m_dataType(dataType)
896     , m_atomicOp(atomicOp)
897 {
898 }
899 
iterate(void)900 tcu::TestStatus AtomicOperationCaseInstance::iterate(void)
901 {
902     de::UniquePtr<BufferInterface> testBuffer(createTestBuffer(m_dataType, m_atomicOp));
903     tcu::TestLog &log          = m_context.getTestContext().getLog();
904     const DeviceInterface &vkd = m_context.getDeviceInterface();
905     const VkDevice device      = m_context.getDevice();
906     de::Random rnd(0x62a15e34);
907     const bool useRef               = (m_shaderType.getMemoryType() == AtomicMemoryType::REFERENCE);
908     const VkDescriptorType descType = (useRef ? VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER : VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
909     const VkBufferUsageFlags usageFlags =
910         (VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
911          (useRef ? static_cast<VkBufferUsageFlags>(VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT) : 0u));
912 
913     // The main buffer will hold test data. When using buffer references, the buffer's address will be indirectly passed as part of
914     // a uniform buffer. If not, it will be passed directly as a descriptor.
915     Buffer buffer(m_context, usageFlags, testBuffer->bufferSize(), useRef);
916     std::unique_ptr<Buffer> auxBuffer;
917 
918     if (useRef)
919     {
920         // Pass the main buffer address inside a uniform buffer.
921         const VkBufferDeviceAddressInfo addressInfo = {
922             VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO, // VkStructureType sType;
923             nullptr,                                      // const void* pNext;
924             buffer.getBuffer(),                           // VkBuffer buffer;
925         };
926         const auto address = vkd.getBufferDeviceAddress(device, &addressInfo);
927 
928         auxBuffer.reset(new Buffer(m_context, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, sizeof(address), false));
929         deMemcpy(auxBuffer->getHostPtr(), &address, sizeof(address));
930         auxBuffer->flush();
931     }
932 
933     testBuffer->setBuffer(buffer.getHostPtr());
934     testBuffer->fillWithTestData(rnd);
935 
936     buffer.flush();
937 
938     Move<VkDescriptorSetLayout> extraResourcesLayout;
939     Move<VkDescriptorPool> extraResourcesSetPool;
940     Move<VkDescriptorSet> extraResourcesSet;
941 
942     const VkDescriptorSetLayoutBinding bindings[] = {{0u, descType, 1, VK_SHADER_STAGE_ALL, nullptr}};
943 
944     const VkDescriptorSetLayoutCreateInfo layoutInfo = {VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, nullptr,
945                                                         (VkDescriptorSetLayoutCreateFlags)0u,
946                                                         DE_LENGTH_OF_ARRAY(bindings), bindings};
947 
948     extraResourcesLayout = createDescriptorSetLayout(vkd, device, &layoutInfo);
949 
950     const VkDescriptorPoolSize poolSizes[] = {{descType, 1u}};
951 
952     const VkDescriptorPoolCreateInfo poolInfo = {
953         VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
954         nullptr,
955         (VkDescriptorPoolCreateFlags)VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT,
956         1u, // maxSets
957         DE_LENGTH_OF_ARRAY(poolSizes),
958         poolSizes};
959 
960     extraResourcesSetPool = createDescriptorPool(vkd, device, &poolInfo);
961 
962     const VkDescriptorSetAllocateInfo allocInfo = {VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO, nullptr,
963                                                    *extraResourcesSetPool, 1u, &extraResourcesLayout.get()};
964 
965     extraResourcesSet = allocateDescriptorSet(vkd, device, &allocInfo);
966 
967     VkDescriptorBufferInfo bufferInfo;
968     bufferInfo.buffer = (useRef ? auxBuffer->getBuffer() : buffer.getBuffer());
969     bufferInfo.offset = 0u;
970     bufferInfo.range  = VK_WHOLE_SIZE;
971 
972     const VkWriteDescriptorSet descriptorWrite = {VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
973                                                   nullptr,
974                                                   *extraResourcesSet,
975                                                   0u, // dstBinding
976                                                   0u, // dstArrayElement
977                                                   1u,
978                                                   descType,
979                                                   nullptr,
980                                                   &bufferInfo,
981                                                   nullptr};
982 
983     vkd.updateDescriptorSets(device, 1u, &descriptorWrite, 0u, nullptr);
984 
985     // Storage for output varying data.
986     std::vector<uint32_t> outputs(NUM_ELEMENTS);
987     std::vector<void *> outputPtr(NUM_ELEMENTS);
988 
989     for (size_t i = 0; i < NUM_ELEMENTS; i++)
990     {
991         outputs[i]   = 0xcdcdcdcd;
992         outputPtr[i] = &outputs[i];
993     }
994 
995     const int numWorkGroups = (m_shaderType.isSharedLike() ? 1 : static_cast<int>(NUM_ELEMENTS));
996     UniquePtr<ShaderExecutor> executor(
997         createExecutor(m_context, m_shaderType.getType(), m_shaderSpec, *extraResourcesLayout));
998 
999     executor->execute(numWorkGroups, nullptr, &outputPtr[0], *extraResourcesSet);
1000     buffer.invalidate();
1001 
1002     tcu::ResultCollector resultCollector(log);
1003 
1004     // Check the results of the atomic operation
1005     testBuffer->checkResults(resultCollector);
1006 
1007     return tcu::TestStatus(resultCollector.getResult(), resultCollector.getMessage());
1008 }
1009 
1010 class AtomicOperationCase : public TestCase
1011 {
1012 public:
1013     AtomicOperationCase(tcu::TestContext &testCtx, const char *name, AtomicShaderType type, DataType dataType,
1014                         AtomicOperation atomicOp);
1015     virtual ~AtomicOperationCase(void);
1016 
1017     virtual TestInstance *createInstance(Context &ctx) const;
1018     virtual void checkSupport(Context &ctx) const;
initPrograms(vk::SourceCollections & programCollection) const1019     virtual void initPrograms(vk::SourceCollections &programCollection) const
1020     {
1021         const bool useSpv14   = m_shaderType.isMeshShadingStage();
1022         const auto spvVersion = (useSpv14 ? vk::SPIRV_VERSION_1_4 : vk::SPIRV_VERSION_1_0);
1023         const ShaderBuildOptions buildOptions(programCollection.usedVulkanVersion, spvVersion, 0u, useSpv14);
1024         ShaderSpec sourcesSpec(m_shaderSpec);
1025 
1026         sourcesSpec.buildOptions = buildOptions;
1027         generateSources(m_shaderType.getType(), sourcesSpec, programCollection);
1028     }
1029 
1030 private:
1031     void createShaderSpec();
1032     ShaderSpec m_shaderSpec;
1033     const AtomicShaderType m_shaderType;
1034     const DataType m_dataType;
1035     const AtomicOperation m_atomicOp;
1036 };
1037 
AtomicOperationCase(tcu::TestContext & testCtx,const char * name,AtomicShaderType shaderType,DataType dataType,AtomicOperation atomicOp)1038 AtomicOperationCase::AtomicOperationCase(tcu::TestContext &testCtx, const char *name, AtomicShaderType shaderType,
1039                                          DataType dataType, AtomicOperation atomicOp)
1040     : TestCase(testCtx, name)
1041     , m_shaderType(shaderType)
1042     , m_dataType(dataType)
1043     , m_atomicOp(atomicOp)
1044 {
1045     createShaderSpec();
1046     init();
1047 }
1048 
~AtomicOperationCase(void)1049 AtomicOperationCase::~AtomicOperationCase(void)
1050 {
1051 }
1052 
createInstance(Context & ctx) const1053 TestInstance *AtomicOperationCase::createInstance(Context &ctx) const
1054 {
1055     return new AtomicOperationCaseInstance(ctx, m_shaderSpec, m_shaderType, m_dataType, m_atomicOp);
1056 }
1057 
checkSupport(Context & ctx) const1058 void AtomicOperationCase::checkSupport(Context &ctx) const
1059 {
1060     if ((m_dataType == DATA_TYPE_INT64) || (m_dataType == DATA_TYPE_UINT64))
1061     {
1062         ctx.requireDeviceFunctionality("VK_KHR_shader_atomic_int64");
1063 
1064         const auto atomicInt64Features = ctx.getShaderAtomicInt64Features();
1065         const bool isSharedMemory      = m_shaderType.isSharedLike();
1066 
1067         if (!isSharedMemory && atomicInt64Features.shaderBufferInt64Atomics == VK_FALSE)
1068         {
1069             TCU_THROW(NotSupportedError,
1070                       "VkShaderAtomicInt64: 64-bit integer atomic operations not supported for buffers");
1071         }
1072         if (isSharedMemory && atomicInt64Features.shaderSharedInt64Atomics == VK_FALSE)
1073         {
1074             TCU_THROW(NotSupportedError,
1075                       "VkShaderAtomicInt64: 64-bit integer atomic operations not supported for shared memory");
1076         }
1077     }
1078 
1079     if (m_dataType == DATA_TYPE_FLOAT16)
1080     {
1081         ctx.requireDeviceFunctionality("VK_EXT_shader_atomic_float2");
1082 #ifndef CTS_USES_VULKANSC
1083         if (m_atomicOp == ATOMIC_OP_ADD)
1084         {
1085             if (m_shaderType.isSharedLike())
1086             {
1087                 if (!ctx.getShaderAtomicFloat2FeaturesEXT().shaderSharedFloat16AtomicAdd)
1088                 {
1089                     TCU_THROW(NotSupportedError,
1090                               "VkShaderAtomicFloat16: 16-bit floating point shared add atomic operation not supported");
1091                 }
1092             }
1093             else
1094             {
1095                 if (!ctx.getShaderAtomicFloat2FeaturesEXT().shaderBufferFloat16AtomicAdd)
1096                 {
1097                     TCU_THROW(NotSupportedError,
1098                               "VkShaderAtomicFloat16: 16-bit floating point buffer add atomic operation not supported");
1099                 }
1100             }
1101         }
1102         if (m_atomicOp == ATOMIC_OP_MIN || m_atomicOp == ATOMIC_OP_MAX)
1103         {
1104             if (m_shaderType.isSharedLike())
1105             {
1106                 if (!ctx.getShaderAtomicFloat2FeaturesEXT().shaderSharedFloat16AtomicMinMax)
1107                 {
1108                     TCU_THROW(
1109                         NotSupportedError,
1110                         "VkShaderAtomicFloat16: 16-bit floating point shared min/max atomic operation not supported");
1111                 }
1112             }
1113             else
1114             {
1115                 if (!ctx.getShaderAtomicFloat2FeaturesEXT().shaderBufferFloat16AtomicMinMax)
1116                 {
1117                     TCU_THROW(
1118                         NotSupportedError,
1119                         "VkShaderAtomicFloat16: 16-bit floating point buffer min/max atomic operation not supported");
1120                 }
1121             }
1122         }
1123         if (m_atomicOp == ATOMIC_OP_EXCHANGE)
1124         {
1125             if (m_shaderType.isSharedLike())
1126             {
1127                 if (!ctx.getShaderAtomicFloat2FeaturesEXT().shaderSharedFloat16Atomics)
1128                 {
1129                     TCU_THROW(NotSupportedError,
1130                               "VkShaderAtomicFloat16: 16-bit floating point shared atomic operations not supported");
1131                 }
1132             }
1133             else
1134             {
1135                 if (!ctx.getShaderAtomicFloat2FeaturesEXT().shaderBufferFloat16Atomics)
1136                 {
1137                     TCU_THROW(NotSupportedError,
1138                               "VkShaderAtomicFloat16: 16-bit floating point buffer atomic operations not supported");
1139                 }
1140             }
1141         }
1142 #endif // CTS_USES_VULKANSC
1143     }
1144 
1145 #ifndef CTS_USES_VULKANSC
1146     if (m_dataType == DATA_TYPE_FLOAT16X2 || m_dataType == DATA_TYPE_FLOAT16X4)
1147     {
1148         ctx.requireDeviceFunctionality("VK_NV_shader_atomic_float16_vector");
1149         if (!ctx.getShaderAtomicFloat16VectorFeaturesNV().shaderFloat16VectorAtomics)
1150         {
1151             TCU_THROW(NotSupportedError, "16-bit floating point vector atomic operations not supported");
1152         }
1153     }
1154 #endif // CTS_USES_VULKANSC
1155 
1156     if (m_dataType == DATA_TYPE_FLOAT32)
1157     {
1158         ctx.requireDeviceFunctionality("VK_EXT_shader_atomic_float");
1159         if (m_atomicOp == ATOMIC_OP_ADD)
1160         {
1161             if (m_shaderType.isSharedLike())
1162             {
1163                 if (!ctx.getShaderAtomicFloatFeaturesEXT().shaderSharedFloat32AtomicAdd)
1164                 {
1165                     TCU_THROW(NotSupportedError,
1166                               "VkShaderAtomicFloat32: 32-bit floating point shared add atomic operation not supported");
1167                 }
1168             }
1169             else
1170             {
1171                 if (!ctx.getShaderAtomicFloatFeaturesEXT().shaderBufferFloat32AtomicAdd)
1172                 {
1173                     TCU_THROW(NotSupportedError,
1174                               "VkShaderAtomicFloat32: 32-bit floating point buffer add atomic operation not supported");
1175                 }
1176             }
1177         }
1178         if (m_atomicOp == ATOMIC_OP_MIN || m_atomicOp == ATOMIC_OP_MAX)
1179         {
1180             ctx.requireDeviceFunctionality("VK_EXT_shader_atomic_float2");
1181 #ifndef CTS_USES_VULKANSC
1182             if (m_shaderType.isSharedLike())
1183             {
1184                 if (!ctx.getShaderAtomicFloat2FeaturesEXT().shaderSharedFloat32AtomicMinMax)
1185                 {
1186                     TCU_THROW(
1187                         NotSupportedError,
1188                         "VkShaderAtomicFloat32: 32-bit floating point shared min/max atomic operation not supported");
1189                 }
1190             }
1191             else
1192             {
1193                 if (!ctx.getShaderAtomicFloat2FeaturesEXT().shaderBufferFloat32AtomicMinMax)
1194                 {
1195                     TCU_THROW(
1196                         NotSupportedError,
1197                         "VkShaderAtomicFloat32: 32-bit floating point buffer min/max atomic operation not supported");
1198                 }
1199             }
1200 #endif // CTS_USES_VULKANSC
1201         }
1202         if (m_atomicOp == ATOMIC_OP_EXCHANGE)
1203         {
1204             if (m_shaderType.isSharedLike())
1205             {
1206                 if (!ctx.getShaderAtomicFloatFeaturesEXT().shaderSharedFloat32Atomics)
1207                 {
1208                     TCU_THROW(NotSupportedError,
1209                               "VkShaderAtomicFloat32: 32-bit floating point shared atomic operations not supported");
1210                 }
1211             }
1212             else
1213             {
1214                 if (!ctx.getShaderAtomicFloatFeaturesEXT().shaderBufferFloat32Atomics)
1215                 {
1216                     TCU_THROW(NotSupportedError,
1217                               "VkShaderAtomicFloat32: 32-bit floating point buffer atomic operations not supported");
1218                 }
1219             }
1220         }
1221     }
1222 
1223     if (m_dataType == DATA_TYPE_FLOAT64)
1224     {
1225         ctx.requireDeviceFunctionality("VK_EXT_shader_atomic_float");
1226         if (m_atomicOp == ATOMIC_OP_ADD)
1227         {
1228             if (m_shaderType.isSharedLike())
1229             {
1230                 if (!ctx.getShaderAtomicFloatFeaturesEXT().shaderSharedFloat64AtomicAdd)
1231                 {
1232                     TCU_THROW(NotSupportedError,
1233                               "VkShaderAtomicFloat64: 64-bit floating point shared add atomic operation not supported");
1234                 }
1235             }
1236             else
1237             {
1238                 if (!ctx.getShaderAtomicFloatFeaturesEXT().shaderBufferFloat64AtomicAdd)
1239                 {
1240                     TCU_THROW(NotSupportedError,
1241                               "VkShaderAtomicFloat64: 64-bit floating point buffer add atomic operation not supported");
1242                 }
1243             }
1244         }
1245         if (m_atomicOp == ATOMIC_OP_MIN || m_atomicOp == ATOMIC_OP_MAX)
1246         {
1247             ctx.requireDeviceFunctionality("VK_EXT_shader_atomic_float2");
1248 #ifndef CTS_USES_VULKANSC
1249             if (m_shaderType.isSharedLike())
1250             {
1251                 if (!ctx.getShaderAtomicFloat2FeaturesEXT().shaderSharedFloat64AtomicMinMax)
1252                 {
1253                     TCU_THROW(
1254                         NotSupportedError,
1255                         "VkShaderAtomicFloat64: 64-bit floating point shared min/max atomic operation not supported");
1256                 }
1257             }
1258             else
1259             {
1260                 if (!ctx.getShaderAtomicFloat2FeaturesEXT().shaderBufferFloat64AtomicMinMax)
1261                 {
1262                     TCU_THROW(
1263                         NotSupportedError,
1264                         "VkShaderAtomicFloat64: 64-bit floating point buffer min/max atomic operation not supported");
1265                 }
1266             }
1267 #endif // CTS_USES_VULKANSC
1268         }
1269         if (m_atomicOp == ATOMIC_OP_EXCHANGE)
1270         {
1271             if (m_shaderType.isSharedLike())
1272             {
1273                 if (!ctx.getShaderAtomicFloatFeaturesEXT().shaderSharedFloat64Atomics)
1274                 {
1275                     TCU_THROW(NotSupportedError,
1276                               "VkShaderAtomicFloat64: 64-bit floating point shared atomic operations not supported");
1277                 }
1278             }
1279             else
1280             {
1281                 if (!ctx.getShaderAtomicFloatFeaturesEXT().shaderBufferFloat64Atomics)
1282                 {
1283                     TCU_THROW(NotSupportedError,
1284                               "VkShaderAtomicFloat64: 64-bit floating point buffer atomic operations not supported");
1285                 }
1286             }
1287         }
1288     }
1289 
1290     if (m_shaderType.getMemoryType() == AtomicMemoryType::REFERENCE)
1291     {
1292         ctx.requireDeviceFunctionality("VK_KHR_buffer_device_address");
1293     }
1294 
1295     checkSupportShader(ctx, m_shaderType.getType());
1296 }
1297 
createShaderSpec(void)1298 void AtomicOperationCase::createShaderSpec(void)
1299 {
1300     const AtomicMemoryType memoryType = m_shaderType.getMemoryType();
1301     const bool isSharedLike           = m_shaderType.isSharedLike();
1302 
1303     // Global declarations.
1304     std::ostringstream shaderTemplateGlobalStream;
1305 
1306     // Structure in use for atomic operations.
1307     shaderTemplateGlobalStream << "${EXTENSIONS}\n"
1308                                << "\n"
1309                                << "struct AtomicStruct\n"
1310                                << "{\n"
1311                                << "    ${DATATYPE} inoutValues[${N}/2];\n"
1312                                << "    ${DATATYPE} inputValues[${N}];\n"
1313                                << "    ${DATATYPE} compareValues[${N}];\n"
1314                                << "    ${DATATYPE} outputValues[${N}];\n"
1315                                << "    int invocationHitCount[${N}];\n"
1316                                << "    int index;\n"
1317                                << "};\n"
1318                                << "\n";
1319 
1320     // The name dance and declarations below will make sure the structure that will be used with atomic operations can be accessed
1321     // as "buf.data", which is the name used in the atomic operation statements.
1322     //
1323     // * When using a buffer directly, RESULT_BUFFER_NAME will be "buf" and the inner struct will be "data".
1324     // * When using a workgroup-shared global variable, the "data" struct will be nested in an auxiliar "buf" struct.
1325     // * When using buffer references, the uniform buffer reference will be called "buf" and its contents "data".
1326     //
1327     if (memoryType != AtomicMemoryType::REFERENCE)
1328     {
1329         shaderTemplateGlobalStream << "layout (set = ${SETIDX}, binding = 0) buffer AtomicBuffer {\n"
1330                                    << "    AtomicStruct data;\n"
1331                                    << "} ${RESULT_BUFFER_NAME};\n"
1332                                    << "\n";
1333 
1334         // When using global shared memory in the compute, task or mesh variants, invocations will use a shared global structure
1335         // instead of a descriptor set as the sources and results of each tested operation.
1336         if (memoryType == AtomicMemoryType::SHARED)
1337         {
1338             shaderTemplateGlobalStream << "shared struct { AtomicStruct data; } buf;\n"
1339                                        << "\n";
1340         }
1341         else if (memoryType == AtomicMemoryType::PAYLOAD)
1342         {
1343             shaderTemplateGlobalStream << "struct TaskData { AtomicStruct data; };\n"
1344                                        << "taskPayloadSharedEXT TaskData buf;\n";
1345         }
1346     }
1347     else
1348     {
1349         shaderTemplateGlobalStream << "layout (buffer_reference) buffer AtomicBuffer {\n"
1350                                    << "    AtomicStruct data;\n"
1351                                    << "};\n"
1352                                    << "\n"
1353                                    << "layout (set = ${SETIDX}, binding = 0) uniform References {\n"
1354                                    << "    AtomicBuffer buf;\n"
1355                                    << "};\n"
1356                                    << "\n";
1357     }
1358 
1359     const auto shaderTemplateGlobalString = shaderTemplateGlobalStream.str();
1360     const tcu::StringTemplate shaderTemplateGlobal(shaderTemplateGlobalString);
1361 
1362     // Shader body for the non-vertex case.
1363     std::ostringstream nonVertexShaderTemplateStream;
1364 
1365     if (isSharedLike)
1366     {
1367         // Invocation zero will initialize the shared structure from the descriptor set.
1368         nonVertexShaderTemplateStream << "if (gl_LocalInvocationIndex == 0u)\n"
1369                                       << "{\n"
1370                                       << "    buf.data = ${RESULT_BUFFER_NAME}.data;\n"
1371                                       << "}\n"
1372                                       << "barrier();\n";
1373     }
1374 
1375     if (m_shaderType.getType() == glu::SHADERTYPE_FRAGMENT)
1376     {
1377         nonVertexShaderTemplateStream << "if (!gl_HelperInvocation) {\n"
1378                                       << "    int idx = atomicAdd(buf.data.index, 1);\n"
1379                                       << "    buf.data.outputValues[idx] = ${ATOMICOP}(buf.data.inoutValues[idx % "
1380                                          "(${N}/2)], ${COMPARE_ARG}buf.data.inputValues[idx]);\n"
1381                                       << "}\n";
1382     }
1383     else
1384     {
1385         nonVertexShaderTemplateStream << "if (atomicAdd(buf.data.invocationHitCount[0], 1) < ${N})\n"
1386                                       << "{\n"
1387                                       << "    int idx = atomicAdd(buf.data.index, 1);\n"
1388                                       << "    buf.data.outputValues[idx] = ${ATOMICOP}(buf.data.inoutValues[idx % "
1389                                          "(${N}/2)], ${COMPARE_ARG}buf.data.inputValues[idx]);\n"
1390                                       << "}\n";
1391     }
1392 
1393     if (isSharedLike)
1394     {
1395         // Invocation zero will copy results back to the descriptor set.
1396         nonVertexShaderTemplateStream << "barrier();\n"
1397                                       << "if (gl_LocalInvocationIndex == 0u)\n"
1398                                       << "{\n"
1399                                       << "    ${RESULT_BUFFER_NAME}.data = buf.data;\n"
1400                                       << "}\n";
1401     }
1402 
1403     const auto nonVertexShaderTemplateStreamStr = nonVertexShaderTemplateStream.str();
1404     const tcu::StringTemplate nonVertexShaderTemplateSrc(nonVertexShaderTemplateStreamStr);
1405 
1406     // Shader body for the vertex case.
1407     const tcu::StringTemplate vertexShaderTemplateSrc(
1408         "int idx = gl_VertexIndex;\n"
1409         "if (atomicAdd(buf.data.invocationHitCount[idx], 1) == 0)\n"
1410         "{\n"
1411         "    buf.data.outputValues[idx] = ${ATOMICOP}(buf.data.inoutValues[idx % (${N}/2)], "
1412         "${COMPARE_ARG}buf.data.inputValues[idx]);\n"
1413         "}\n");
1414 
1415     // Extensions.
1416     std::ostringstream extensions;
1417 
1418     if ((m_dataType == DATA_TYPE_INT64) || (m_dataType == DATA_TYPE_UINT64))
1419     {
1420         extensions << "#extension GL_EXT_shader_explicit_arithmetic_types_int64 : enable\n"
1421                    << "#extension GL_EXT_shader_atomic_int64 : enable\n";
1422     }
1423     else if ((m_dataType == DATA_TYPE_FLOAT16) || (m_dataType == DATA_TYPE_FLOAT16X2) ||
1424              (m_dataType == DATA_TYPE_FLOAT16X4) || (m_dataType == DATA_TYPE_FLOAT32) ||
1425              (m_dataType == DATA_TYPE_FLOAT64))
1426     {
1427         extensions << "#extension GL_EXT_shader_explicit_arithmetic_types_float16 : enable\n"
1428                    << "#extension GL_EXT_shader_atomic_float : enable\n"
1429                    << "#extension GL_EXT_shader_atomic_float2 : enable\n"
1430                    << "#extension GL_KHR_memory_scope_semantics : enable\n";
1431         if (m_dataType == DATA_TYPE_FLOAT16X2 || m_dataType == DATA_TYPE_FLOAT16X4)
1432         {
1433             extensions << "#extension GL_NV_shader_atomic_fp16_vector : require\n";
1434         }
1435     }
1436 
1437     if (memoryType == AtomicMemoryType::REFERENCE)
1438     {
1439         extensions << "#extension GL_EXT_buffer_reference : require\n";
1440     }
1441 
1442     // Specializations.
1443     std::map<std::string, std::string> specializations;
1444 
1445     specializations["EXTENSIONS"]  = extensions.str();
1446     specializations["DATATYPE"]    = dataType2Str(m_dataType);
1447     specializations["ATOMICOP"]    = atomicOp2Str(m_atomicOp);
1448     specializations["SETIDX"]      = de::toString((int)EXTRA_RESOURCES_DESCRIPTOR_SET_INDEX);
1449     specializations["N"]           = de::toString((int)NUM_ELEMENTS);
1450     specializations["COMPARE_ARG"] = ((m_atomicOp == ATOMIC_OP_COMP_SWAP) ? "buf.data.compareValues[idx], " : "");
1451     specializations["RESULT_BUFFER_NAME"] = (isSharedLike ? "result" : "buf");
1452 
1453     // Shader spec.
1454     m_shaderSpec.outputs.push_back(Symbol("outData", glu::VarType(glu::TYPE_UINT, glu::PRECISION_HIGHP)));
1455     m_shaderSpec.glslVersion        = glu::GLSL_VERSION_450;
1456     m_shaderSpec.globalDeclarations = shaderTemplateGlobal.specialize(specializations);
1457     m_shaderSpec.source =
1458         ((m_shaderType.getType() == glu::SHADERTYPE_VERTEX) ? vertexShaderTemplateSrc.specialize(specializations) :
1459                                                               nonVertexShaderTemplateSrc.specialize(specializations));
1460 
1461     if (isSharedLike)
1462     {
1463         // When using global shared memory, use a single workgroup and an appropriate number of local invocations.
1464         m_shaderSpec.localSizeX = static_cast<int>(NUM_ELEMENTS);
1465     }
1466 }
1467 
addAtomicOperationTests(tcu::TestCaseGroup * atomicOperationTestsGroup)1468 void addAtomicOperationTests(tcu::TestCaseGroup *atomicOperationTestsGroup)
1469 {
1470     tcu::TestContext &testCtx = atomicOperationTestsGroup->getTestContext();
1471 
1472     static const struct
1473     {
1474         glu::ShaderType type;
1475         const char *name;
1476     } shaderTypes[] = {
1477         {glu::SHADERTYPE_VERTEX, "vertex"},
1478         {glu::SHADERTYPE_FRAGMENT, "fragment"},
1479         {glu::SHADERTYPE_GEOMETRY, "geometry"},
1480         {glu::SHADERTYPE_TESSELLATION_CONTROL, "tess_ctrl"},
1481         {glu::SHADERTYPE_TESSELLATION_EVALUATION, "tess_eval"},
1482         {glu::SHADERTYPE_COMPUTE, "compute"},
1483         {glu::SHADERTYPE_TASK, "task"},
1484         {glu::SHADERTYPE_MESH, "mesh"},
1485     };
1486 
1487     static const struct
1488     {
1489         AtomicMemoryType type;
1490         const char *suffix;
1491     } kMemoryTypes[] = {
1492         {AtomicMemoryType::BUFFER, ""},
1493         {AtomicMemoryType::SHARED, "_shared"},
1494         {AtomicMemoryType::REFERENCE, "_reference"},
1495         {AtomicMemoryType::PAYLOAD, "_payload"},
1496     };
1497 
1498     static const struct
1499     {
1500         DataType dataType;
1501         const char *name;
1502     } dataSign[] = {
1503 #ifndef CTS_USES_VULKANSC
1504         // Tests using 16-bit float data
1505         {DATA_TYPE_FLOAT16, "float16"},
1506         // Tests using f16vec2 data
1507         {DATA_TYPE_FLOAT16X2, "f16vec2"},
1508         // Tests using f16vec4 data
1509         {DATA_TYPE_FLOAT16X4, "f16vec4"},
1510 #endif // CTS_USES_VULKANSC
1511         // Tests using signed data (int)
1512         {DATA_TYPE_INT32, "signed"},
1513         // Tests using unsigned data (uint)
1514         {DATA_TYPE_UINT32, "unsigned"},
1515         // Tests using 32-bit float data
1516         {DATA_TYPE_FLOAT32, "float32"},
1517         // Tests using 64 bit signed data (int64)
1518         {DATA_TYPE_INT64, "signed64bit"},
1519         // Tests using 64 bit unsigned data (uint64)
1520         {DATA_TYPE_UINT64, "unsigned64bit"},
1521         // Tests using 64-bit float data)
1522         {DATA_TYPE_FLOAT64, "float64"}};
1523 
1524     static const struct
1525     {
1526         AtomicOperation value;
1527         const char *name;
1528     } atomicOp[] = {{ATOMIC_OP_EXCHANGE, "exchange"},
1529                     {ATOMIC_OP_COMP_SWAP, "comp_swap"},
1530                     {ATOMIC_OP_ADD, "add"},
1531                     {ATOMIC_OP_MIN, "min"},
1532                     {ATOMIC_OP_MAX, "max"},
1533                     {ATOMIC_OP_AND, "and"},
1534                     {ATOMIC_OP_OR, "or"},
1535                     {ATOMIC_OP_XOR, "xor"}};
1536 
1537     for (int opNdx = 0; opNdx < DE_LENGTH_OF_ARRAY(atomicOp); opNdx++)
1538     {
1539         for (int signNdx = 0; signNdx < DE_LENGTH_OF_ARRAY(dataSign); signNdx++)
1540         {
1541             for (int shaderTypeNdx = 0; shaderTypeNdx < DE_LENGTH_OF_ARRAY(shaderTypes); shaderTypeNdx++)
1542             {
1543                 // Only ADD and EXCHANGE are supported on floating-point
1544                 if (dataSign[signNdx].dataType == DATA_TYPE_FLOAT16 ||
1545                     dataSign[signNdx].dataType == DATA_TYPE_FLOAT16X2 ||
1546                     dataSign[signNdx].dataType == DATA_TYPE_FLOAT16X4 ||
1547                     dataSign[signNdx].dataType == DATA_TYPE_FLOAT32 || dataSign[signNdx].dataType == DATA_TYPE_FLOAT64)
1548                 {
1549                     if (atomicOp[opNdx].value != ATOMIC_OP_ADD &&
1550 #ifndef CTS_USES_VULKANSC
1551                         atomicOp[opNdx].value != ATOMIC_OP_MIN && atomicOp[opNdx].value != ATOMIC_OP_MAX &&
1552 #endif // CTS_USES_VULKANSC
1553                         atomicOp[opNdx].value != ATOMIC_OP_EXCHANGE)
1554                     {
1555                         continue;
1556                     }
1557                 }
1558 
1559                 for (int memoryTypeNdx = 0; memoryTypeNdx < DE_LENGTH_OF_ARRAY(kMemoryTypes); ++memoryTypeNdx)
1560                 {
1561                     // Shared memory only available in compute, task and mesh shaders.
1562                     if (kMemoryTypes[memoryTypeNdx].type == AtomicMemoryType::SHARED &&
1563                         shaderTypes[shaderTypeNdx].type != glu::SHADERTYPE_COMPUTE &&
1564                         shaderTypes[shaderTypeNdx].type != glu::SHADERTYPE_TASK &&
1565                         shaderTypes[shaderTypeNdx].type != glu::SHADERTYPE_MESH)
1566                         continue;
1567 
1568                     // Payload memory is only available for atomics in task shaders (in mesh shaders it's read-only)
1569                     if (kMemoryTypes[memoryTypeNdx].type == AtomicMemoryType::PAYLOAD &&
1570                         shaderTypes[shaderTypeNdx].type != glu::SHADERTYPE_TASK)
1571                         continue;
1572 
1573                     const std::string name =
1574                         std::string(atomicOp[opNdx].name) + "_" + std::string(dataSign[signNdx].name) + "_" +
1575                         std::string(shaderTypes[shaderTypeNdx].name) + kMemoryTypes[memoryTypeNdx].suffix;
1576 
1577                     atomicOperationTestsGroup->addChild(new AtomicOperationCase(
1578                         testCtx, name.c_str(),
1579                         AtomicShaderType(shaderTypes[shaderTypeNdx].type, kMemoryTypes[memoryTypeNdx].type),
1580                         dataSign[signNdx].dataType, atomicOp[opNdx].value));
1581                 }
1582             }
1583         }
1584     }
1585 }
1586 
1587 } // namespace
1588 
createAtomicOperationTests(tcu::TestContext & testCtx)1589 tcu::TestCaseGroup *createAtomicOperationTests(tcu::TestContext &testCtx)
1590 {
1591     return createTestGroup(testCtx, "atomic_operations", addAtomicOperationTests);
1592 }
1593 
1594 } // namespace shaderexecutor
1595 } // namespace vkt
1596