• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*------------------------------------------------------------------------
2  * Vulkan Conformance Tests
3  * ------------------------
4  *
5  * Copyright (c) 2019 The Khronos Group Inc.
6  * Copyright (c) 2018-2020 NVIDIA Corporation
7  *
8  * Licensed under the Apache License, Version 2.0 (the "License");
9  * you may not use this file except in compliance with the License.
10  * You may obtain a copy of the License at
11  *
12  *      http://www.apache.org/licenses/LICENSE-2.0
13  *
14  * Unless required by applicable law or agreed to in writing, software
15  * distributed under the License is distributed on an "AS IS" BASIS,
16  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17  * See the License for the specific language governing permissions and
18  * limitations under the License.
19  *
20  * \file
21  * \brief Vulkan Reconvergence tests
22  *//*--------------------------------------------------------------------*/
23 
24 #include "vktReconvergenceTests.hpp"
25 
26 #include "vkBufferWithMemory.hpp"
27 #include "vkImageWithMemory.hpp"
28 #include "vkQueryUtil.hpp"
29 #include "vkBuilderUtil.hpp"
30 #include "vkCmdUtil.hpp"
31 #include "vkTypeUtil.hpp"
32 #include "vkObjUtil.hpp"
33 
34 #include "vktTestGroupUtil.hpp"
35 #include "vktTestCase.hpp"
36 #include "vktAmberTestCase.hpp"
37 
38 #include "deDefs.h"
39 #include "deFloat16.h"
40 #include "deMath.h"
41 #include "deRandom.h"
42 #include "deSharedPtr.hpp"
43 #include "deString.h"
44 
45 #include "tcuTestCase.hpp"
46 #include "tcuTestLog.hpp"
47 
48 #include <array>
49 #include <bitset>
50 #include <functional>
51 #include <map>
52 #include <numeric>
53 #include <random>
54 #include <string>
55 #include <sstream>
56 #include <set>
57 #include <type_traits>
58 #include <vector>
59 #include <memory>
60 #include <cmath>
61 #include <initializer_list>
62 
63 #include <iostream>
64 
65 // #define INCLUDE_GRAPHICS_TESTS
66 
67 namespace vkt
68 {
69 namespace Reconvergence
70 {
71 namespace
72 {
73 using namespace vk;
74 using namespace std;
75 
// Number of elements of a C array (do not pass a pointer).
#define ARRAYSIZE(arr__) (sizeof(arr__) / sizeof(arr__[0]))
// Rounds value__ up to the nearest multiple of multiple__.
#define ROUNDUP(value__, multiple__) ((((value__) + ((multiple__)-1)) / (multiple__)) * (multiple__))
// Rounds value__ down to the nearest multiple of multiple__.
#define ROUNDDOWN(value__, multiple__) (((value__) / (multiple__)) * (multiple__))
// Upper bound on the invocation count of any single test; it sizes bitset_inv_t.
constexpr uint32_t MAX_INVOCATIONS_ALL_TESTS = 64u * 64u;
// One bit per invocation.
using bitset_inv_t = std::bitset<MAX_INVOCATIONS_ALL_TESTS>;
//constexpr bitset_inv_t MAGIC_BALLOT = 0x12345678;
82 
// Flavour of control flow / reconvergence exercised by a test case.
enum TestType
{
    TT_SUCF_ELECT  = 0, // subgroup_uniform_control_flow using elect (subgroup_basic)
    TT_SUCF_BALLOT = 1, // subgroup_uniform_control_flow using ballot (subgroup_ballot)
    TT_WUCF_ELECT  = 2, // workgroup uniform control flow using elect (subgroup_basic)
    TT_WUCF_BALLOT = 3, // workgroup uniform control flow using ballot (subgroup_ballot)
    TT_MAXIMAL     = 4, // maximal reconvergence
};
91 
92 static_assert(VK_TRUE == 1, "VK_TRUE must equal 1");
93 
94 struct CaseDef
95 {
96     VkShaderStageFlagBits shaderStage;
97     TestType testType;
98     uint32_t maxNesting;
99     uint32_t seed;
100     // In the case of compute shader below sizes would be local_size_x and local_size_y respectively.
101     // In the case of fragment shader these sizes would define framebuffer dimensions.
102     uint32_t sizeX;
103     uint32_t sizeY;
104 
isWUCFvkt::Reconvergence::__anone030def80111::CaseDef105     bool isWUCF() const
106     {
107         return testType == TT_WUCF_ELECT || testType == TT_WUCF_BALLOT;
108     }
isSUCFvkt::Reconvergence::__anone030def80111::CaseDef109     bool isSUCF() const
110     {
111         return testType == TT_SUCF_ELECT || testType == TT_SUCF_BALLOT;
112     }
isUCFvkt::Reconvergence::__anone030def80111::CaseDef113     bool isUCF() const
114     {
115         return isWUCF() || isSUCF();
116     }
isElectvkt::Reconvergence::__anone030def80111::CaseDef117     bool isElect() const
118     {
119         return testType == TT_WUCF_ELECT || testType == TT_SUCF_ELECT;
120     }
121 
verifyvkt::Reconvergence::__anone030def80111::CaseDef122     bool verify() const
123     {
124         return (sizeX * sizeY) <= MAX_INVOCATIONS_ALL_TESTS;
125     }
126 };
127 
// Reinterprets the raw pointer p as an array of T and returns the iterator pair
// { begin, begin advanced by n }.
template <class T, class P = T (*)[1], class R = decltype(std::begin(*std::declval<P>()))>
static auto makeStdBeginEnd(void *p, uint32_t n) -> std::pair<R, R>
{
    auto last        = std::begin(*static_cast<P>(p));
    const auto first = last;
    std::advance(last, n);
    return {first, last};
}
136 
// Shorthand: R -> R&
template <class R>
using add_ref = std::add_lvalue_reference_t<R>;
// Shorthand: R -> const R&
template <class R>
using add_cref = std::add_lvalue_reference_t<std::add_const_t<R>>;
// Shorthand: X -> X*
template <class X>
using add_ptr = typename std::add_pointer<X>::type;
// Shorthand: X -> const X*
template <class X>
using add_cptr = typename std::add_pointer<typename std::add_const<X>::type>::type;
145 
// Returns an iterator to the first greatest element of [first, last),
// or last when the range is empty. Elements are compared with operator>.
template <class RndIter>
RndIter max_element(RndIter first, RndIter last)
{
    if (first == last)
        return last;

    RndIter best = first;
    while (++first != last)
    {
        if (*first > *best)
            best = first;
    }
    return best;
}
160 
// Returns an iterator to the first element of [first, last) whose selector(value)
// is greatest, or last when the range is empty. Keys are compared with operator>.
template <class RndIter, class Selector>
RndIter max_element(RndIter first, RndIter last, Selector selector)
{
    if (first == last)
        return last;

    RndIter best = first;
    while (++first != last)
    {
        if (selector(*first) > selector(*best))
            best = first;
    }
    return best;
}
175 
176 struct Ballot : public std::bitset<128>
177 {
178     typedef std::bitset<128> super;
Ballotvkt::Reconvergence::__anone030def80111::Ballot179     Ballot() : super()
180     {
181     }
Ballotvkt::Reconvergence::__anone030def80111::Ballot182     Ballot(add_cref<super> ballot, uint32_t printbits = 128u) : super(ballot), m_bits(printbits)
183     {
184     }
Ballotvkt::Reconvergence::__anone030def80111::Ballot185     Ballot(add_cref<tcu::UVec4> ballot, uint32_t printbits = 128u) : super(), m_bits(printbits)
186     {
187         *this = ballot;
188     }
Ballotvkt::Reconvergence::__anone030def80111::Ballot189     Ballot(uint64_t val, uint32_t printbits = 128u) : super(val), m_bits(printbits)
190     {
191     }
withSetBitvkt::Reconvergence::__anone030def80111::Ballot192     static Ballot withSetBit(uint32_t bit)
193     {
194         Ballot b;
195         b.set(bit);
196         return b;
197     }
sizevkt::Reconvergence::__anone030def80111::Ballot198     constexpr uint32_t size() const
199     {
200         return static_cast<uint32_t>(super::size());
201     }
operator tcu::UVec4vkt::Reconvergence::__anone030def80111::Ballot202     operator tcu::UVec4() const
203     {
204         tcu::UVec4 result;
205         super ballot(*this);
206         const super mask = 0xFFFFFFFF;
207         for (uint32_t k = 0; k < 4u; ++k)
208         {
209             result[k] = uint32_t((ballot & mask).to_ulong());
210             ballot >>= 32;
211         }
212         return result;
213     }
operator =vkt::Reconvergence::__anone030def80111::Ballot214     add_ref<Ballot> operator=(add_cref<tcu::UVec4> vec)
215     {
216         for (uint32_t k = 0; k < 4u; ++k)
217         {
218             (*this) <<= 32;
219             (*this) |= vec[3 - k];
220         }
221         return *this;
222     }
getwvkt::Reconvergence::__anone030def80111::Ballot223     DE_UNUSED_FUNCTION uint32_t getw() const
224     {
225         return m_bits;
226     }
setwvkt::Reconvergence::__anone030def80111::Ballot227     DE_UNUSED_FUNCTION void setw(uint32_t bits)
228     {
229         m_bits = bits;
230     }
operator <<(add_ref<std::ostream> str,add_cref<Ballot> ballot)231     DE_UNUSED_FUNCTION friend add_ref<std::ostream> operator<<(add_ref<std::ostream> str, add_cref<Ballot> ballot)
232     {
233         for (uint32_t i = 0u; i < ballot.m_bits && i < 128u; ++i)
234         {
235             str << (ballot[ballot.m_bits - i - 1u] ? '1' : '0');
236         }
237         return str;
238     }
239 
240 protected:
241     uint32_t m_bits;
242 };
243 
244 struct Ballots : protected std::vector<std::bitset<128>>
245 {
246     typedef std::vector<value_type> super;
247     static const constexpr uint32_t subgroupInvocationSize = static_cast<uint32_t>(value_type().size());
Ballotsvkt::Reconvergence::__anone030def80111::Ballots248     Ballots() : super()
249     {
250     }
Ballotsvkt::Reconvergence::__anone030def80111::Ballots251     explicit Ballots(uint32_t subgroupCount, add_cref<value_type> ballot = {}) : super(subgroupCount)
252     {
253         if (ballot.any())
254             *this = ballot;
255     }
Ballotsvkt::Reconvergence::__anone030def80111::Ballots256     Ballots(add_cref<Ballots> other) : super(upcast(other))
257     {
258     }
Ballotsvkt::Reconvergence::__anone030def80111::Ballots259     Ballots(Ballots &&other) : super(std::move(other))
260     {
261     }
262     using super::operator[];
263     using super::at;
264     /**
265      * @brief size method
266      * @return Returns the number of bits that the Ballots holds.
267      */
sizevkt::Reconvergence::__anone030def80111::Ballots268     uint32_t size() const
269     {
270         return static_cast<uint32_t>(super::size() * subgroupInvocationSize);
271     }
272     /**
273      * @brief count method
274      * @return Returns the number of bits that are set to true.
275      */
countvkt::Reconvergence::__anone030def80111::Ballots276     uint32_t count() const
277     {
278         uint32_t n = 0u;
279         for (add_cref<value_type> b : *this)
280             n += static_cast<uint32_t>(b.count());
281         return n;
282     }
283     /**
284      * @brief count method
285      * @return Returns the number of bits that are set to true in given subgroup.
286      */
countvkt::Reconvergence::__anone030def80111::Ballots287     uint32_t count(uint32_t subgroup) const
288     {
289         DE_ASSERT(subgroup < subgroupCount());
290         return static_cast<uint32_t>(at(subgroup).count());
291     }
subgroupCountvkt::Reconvergence::__anone030def80111::Ballots292     uint32_t subgroupCount() const
293     {
294         return static_cast<uint32_t>(super::size());
295     }
testvkt::Reconvergence::__anone030def80111::Ballots296     bool test(uint32_t bit) const
297     {
298         DE_ASSERT(bit < size());
299         return at(bit / subgroupInvocationSize).test(bit % subgroupInvocationSize);
300     }
setvkt::Reconvergence::__anone030def80111::Ballots301     bool set(uint32_t bit, bool value = true)
302     {
303         DE_ASSERT(bit <= size());
304         const bool before = test(bit);
305         at(bit / subgroupInvocationSize).set((bit % subgroupInvocationSize), value);
306         return before;
307     }
fullvkt::Reconvergence::__anone030def80111::Ballots308     void full()
309     {
310         const uint32_t bb = size();
311         for (uint32_t b = 0u; b < bb; ++b)
312             set(b);
313     }
setnvkt::Reconvergence::__anone030def80111::Ballots314     add_ref<Ballots> setn(uint32_t bits)
315     {
316         for (uint32_t i = 0u; i < bits; ++i)
317             set(i);
318         return *this;
319     }
allvkt::Reconvergence::__anone030def80111::Ballots320     bool all() const
321     {
322         const uint32_t gg = subgroupCount();
323         for (uint32_t g = 0u; g < gg; ++g)
324         {
325             if (false == at(g).all())
326                 return false;
327         }
328         return (gg != 0u);
329     }
nonevkt::Reconvergence::__anone030def80111::Ballots330     bool none() const
331     {
332         const uint32_t gg = subgroupCount();
333         for (uint32_t g = 0u; g < gg; ++g)
334         {
335             if (false == at(g).none())
336                 return false;
337         }
338         return (gg != 0u);
339     }
anyvkt::Reconvergence::__anone030def80111::Ballots340     bool any() const
341     {
342         bool res          = false;
343         const uint32_t gg = subgroupCount();
344         for (uint32_t g = 0u; g < gg; ++g)
345             res |= super::at(g).any();
346         return res;
347     }
findBitvkt::Reconvergence::__anone030def80111::Ballots348     static uint32_t findBit(uint32_t otherFullyQualifiedInvocationID, uint32_t otherSubgroupSize)
349     {
350         return (((otherFullyQualifiedInvocationID / otherSubgroupSize) * subgroupInvocationSize) +
351                 (otherFullyQualifiedInvocationID % otherSubgroupSize));
352     }
upcastvkt::Reconvergence::__anone030def80111::Ballots353     inline add_cref<super> upcast(add_cref<Ballots> other) const
354     {
355         return static_cast<add_cref<super>>(other);
356     }
operator &=vkt::Reconvergence::__anone030def80111::Ballots357     add_ref<Ballots> operator&=(add_cref<Ballots> other)
358     {
359         DE_ASSERT(subgroupCount() == other.subgroupCount());
360         const uint32_t gg = subgroupCount();
361         for (uint32_t g = 0u; g < gg; ++g)
362             super::at(g) = super::at(g) & upcast(other).at(g);
363         return *this;
364     }
operator &vkt::Reconvergence::__anone030def80111::Ballots365     Ballots operator&(add_cref<Ballots> other) const
366     {
367         Ballots res(*this);
368         res &= other;
369         return res;
370     }
operator |=vkt::Reconvergence::__anone030def80111::Ballots371     add_ref<Ballots> operator|=(add_cref<Ballots> other)
372     {
373         DE_ASSERT(subgroupCount() == other.subgroupCount());
374         const uint32_t gg = subgroupCount();
375         for (uint32_t g = 0u; g < gg; ++g)
376             super::at(g) = super::at(g) | upcast(other).at(g);
377         return *this;
378     }
operator |vkt::Reconvergence::__anone030def80111::Ballots379     Ballots operator|(add_cref<Ballots> other) const
380     {
381         Ballots res(*this);
382         res |= other;
383         return res;
384     }
operator <<=vkt::Reconvergence::__anone030def80111::Ballots385     add_ref<Ballots> operator<<=(uint32_t bits)
386     {
387         return ((*this) = ((*this) << bits));
388     }
operator <<vkt::Reconvergence::__anone030def80111::Ballots389     Ballots operator<<(uint32_t bits) const
390     {
391         Ballots res(subgroupCount());
392         if (bits < size() && bits != 0u)
393         {
394             for (uint32_t b = 0; b < bits; ++b)
395                 res.set((b + bits), test(b));
396         }
397         return res;
398     }
operator ~vkt::Reconvergence::__anone030def80111::Ballots399     Ballots operator~() const
400     {
401         Ballots res(*this);
402         const uint32_t gg = subgroupCount();
403         for (uint32_t g = 0u; g < gg; ++g)
404             res.at(g) = super::at(g).operator~();
405         return res;
406     }
operator ==vkt::Reconvergence::__anone030def80111::Ballots407     bool operator==(add_cref<Ballots> other) const
408     {
409         if (super::size() == upcast(other).size())
410         {
411             const uint32_t gg = subgroupCount();
412             for (uint32_t g = 0u; g < gg; ++g)
413             {
414                 if (at(g) != other[g])
415                     return false;
416             }
417             return true;
418         }
419         return false;
420     }
operator =vkt::Reconvergence::__anone030def80111::Ballots421     add_ref<Ballots> operator=(add_cref<Ballots> other)
422     {
423         DE_ASSERT((subgroupCount() == other.subgroupCount()));
424         const uint32_t gg = subgroupCount();
425         for (uint32_t g = 0u; g < gg; ++g)
426             at(g) = other.at(g);
427         return *this;
428     }
operator =vkt::Reconvergence::__anone030def80111::Ballots429     add_ref<Ballots> operator=(add_cref<value_type> forAllGroups)
430     {
431         DE_ASSERT(super::size() >= 1u);
432         const uint32_t gg = subgroupCount();
433         for (uint32_t g = 0u; g < gg; ++g)
434             at(g) = forAllGroups;
435         return *this;
436     }
437 };
438 
// Returns a 64-bit mask with the lowest subgroupSize bits set.
// Sizes of 64 and above saturate to an all-ones mask: shifting a 64-bit value
// by 64 or more bits is undefined behaviour, so it must never be attempted.
uint64_t subgroupSizeToMask(uint32_t subgroupSize)
{
    if (subgroupSize >= 64u)
        return ~0ULL;
    return (1ULL << subgroupSize) - 1ULL;
}
446 
subgroupSizeToMask(uint32_t subgroupSize,uint32_t subgroupCount)447 Ballot subgroupSizeToMask(uint32_t subgroupSize, uint32_t subgroupCount)
448 {
449     DE_UNREF(subgroupCount);
450     Ballot b;
451     DE_ASSERT(subgroupSize <= b.size());
452     for (uint32_t i = 0; i < subgroupSize; ++i)
453         b.set(i);
454     return b;
455 }
456 
457 // Take a 64-bit integer, mask it to the subgroup size, and then
458 // replicate it for each subgroup
bitsetFromU64(uint64_t mask,uint32_t subgroupSize)459 bitset_inv_t bitsetFromU64(uint64_t mask, uint32_t subgroupSize)
460 {
461     mask &= subgroupSizeToMask(subgroupSize);
462     bitset_inv_t result(mask);
463     for (uint32_t i = 0; i < result.size() / subgroupSize - 1; ++i)
464     {
465         result = (result << subgroupSize) | bitset_inv_t(mask);
466     }
467     return result;
468 }
469 
ballotsFromU64(uint64_t maskValue,uint32_t subgroupSize,uint32_t subgroupCount)470 Ballots ballotsFromU64(uint64_t maskValue, uint32_t subgroupSize, uint32_t subgroupCount)
471 {
472     Ballot b(maskValue);
473     b &= subgroupSizeToMask(subgroupSize, subgroupCount);
474     Ballots result(subgroupCount);
475     for (uint32_t g = 0; g < subgroupCount; ++g)
476         result.at(g) = b;
477     return result;
478 }
479 
ballotsFromBallot(Ballot b,uint32_t subgroupSize,uint32_t subgroupCount)480 Ballots ballotsFromBallot(Ballot b, uint32_t subgroupSize, uint32_t subgroupCount)
481 {
482     b &= subgroupSizeToMask(subgroupSize, subgroupCount);
483     Ballots result(subgroupCount);
484     for (uint32_t g = 0; g < subgroupCount; ++g)
485         result.at(g) = b;
486     return result;
487 }
488 
489 // Pick out the mask for the subgroup that invocationID is a member of
bitsetToU64(const bitset_inv_t & bitset,uint32_t subgroupSize,uint32_t invocationID)490 uint64_t bitsetToU64(const bitset_inv_t &bitset, uint32_t subgroupSize, uint32_t invocationID)
491 {
492     bitset_inv_t copy(bitset);
493     copy >>= (invocationID / subgroupSize) * subgroupSize;
494     copy &= bitset_inv_t(subgroupSizeToMask(subgroupSize));
495     uint64_t mask = copy.to_ullong();
496     mask &= subgroupSizeToMask(subgroupSize);
497     return mask;
498 }
499 
500 // Pick out the mask for the subgroup that invocationID is a member of
bitsetToBallot(const Ballots & bitset,uint32_t subgroupSize,uint32_t invocationID)501 Ballot bitsetToBallot(const Ballots &bitset, uint32_t subgroupSize, uint32_t invocationID)
502 {
503     return bitset.at(invocationID / subgroupSize) & subgroupSizeToMask(subgroupSize, bitset.subgroupCount());
504 }
505 
506 // Pick out the mask for the subgroup that invocationID is a member of
bitsetToBallot(add_cref<Ballots> bitset,add_cref<Ballot> subgroupSizeMask,uint32_t subgroupSize,uint32_t invocationID)507 Ballot bitsetToBallot(add_cref<Ballots> bitset, add_cref<Ballot> subgroupSizeMask, uint32_t subgroupSize,
508                       uint32_t invocationID)
509 {
510     return bitset.at(invocationID / subgroupSize) & subgroupSizeMask;
511 }
512 
bitsetToBallot(uint64_t value,uint32_t subgroupCount,uint32_t subgroupSize,uint32_t invocationID)513 Ballot bitsetToBallot(uint64_t value, uint32_t subgroupCount, uint32_t subgroupSize, uint32_t invocationID)
514 {
515     Ballots bs = ballotsFromU64(value, subgroupSize, subgroupCount);
516     return bitsetToBallot(bs, subgroupSize, invocationID);
517 }
518 
// Returns the index of the least significant set bit of value, or -1 when value is zero.
static int findLSB(uint64_t value)
{
    if (value == 0u)
        return -1;

    int index = 0;
    while ((value & 1ULL) == 0u)
    {
        value >>= 1;
        ++index;
    }
    return index;
}
528 
529 template <uint32_t N>
findLSB(add_cref<std::bitset<N>> value)530 static uint32_t findLSB(add_cref<std::bitset<N>> value)
531 {
532     for (uint32_t i = 0u; i < N; ++i)
533     {
534         if (value.test(i))
535             return i;
536     }
537     return std::numeric_limits<uint32_t>::max();
538 }
539 
540 // For each subgroup, pick out the elected invocationID, and accumulate
541 // a bitset of all of them
bitsetElect(const bitset_inv_t & value,int32_t subgroupSize)542 static bitset_inv_t bitsetElect(const bitset_inv_t &value, int32_t subgroupSize)
543 {
544     bitset_inv_t ret; // zero initialized
545 
546     for (int32_t i = 0; i < (int32_t)value.size(); i += subgroupSize)
547     {
548         uint64_t mask = bitsetToU64(value, subgroupSize, i);
549         int lsb       = findLSB(mask);
550         ret |= bitset_inv_t(lsb == -1 ? 0 : (1ULL << lsb)) << i;
551     }
552     return ret;
553 }
554 
bitsetElect(add_cref<Ballots> value)555 static Ballots bitsetElect(add_cref<Ballots> value)
556 {
557     Ballots ret(value.subgroupCount());
558     for (uint32_t g = 0u; g < value.subgroupCount(); ++g)
559     {
560         const uint32_t lsb = findLSB<Ballots::subgroupInvocationSize>(value.at(g));
561         if (lsb != std::numeric_limits<uint32_t>::max())
562         {
563             ret.at(g).set(lsb);
564         }
565     }
566     return ret;
567 }
568 
// Six 32-bit values kept together in declaration order.
// NOTE(review): judging by the name this block is presumably handed to the shaders
// as push constants - confirm against the pipeline-layout code before reordering
// or resizing any field.
struct PushConstant
{
    int32_t invocationStride; // the only signed member
    uint32_t width;
    uint32_t height;
    uint32_t primitiveStride;
    uint32_t subgroupStride;
    uint32_t enableInvocationIndex;
};
578 
struct Vertex
{
    // Traditional POD structure that mimics a vertex.
    // Be careful before making any changes to this structure,
    // because it is strictly mapped to VK_FORMAT_R32G32B32A32_SFLOAT
    // when the graphics pipeline is constructed.
    float x;
    float y;
    float z;
    float w;
};
587 
588 typedef Vertex Triangle[3];
589 
590 class RandomProgram;
591 class ComputeRandomProgram;
592 
getSubgroupProperties(vkt::Context & context)593 std::pair<vk::VkPhysicalDeviceSubgroupProperties, vk::VkPhysicalDeviceProperties2> getSubgroupProperties(
594     vkt::Context &context)
595 {
596     vk::VkPhysicalDeviceSubgroupProperties subgroupProperties;
597     deMemset(&subgroupProperties, 0, sizeof(subgroupProperties));
598     subgroupProperties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES;
599 
600     vk::VkPhysicalDeviceProperties2 properties2;
601     deMemset(&properties2, 0, sizeof(properties2));
602     properties2.sType = vk::VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2;
603     properties2.pNext = &subgroupProperties;
604 
605     context.getInstanceInterface().getPhysicalDeviceProperties2(context.getPhysicalDevice(), &properties2);
606 
607     return {subgroupProperties, properties2};
608 }
609 
610 class ReconvergenceTestInstance : public TestInstance
611 {
612 public:
613     // { vert, frag, tesc, tese, geom }; if any
614     using Shaders = std::vector<Move<VkShaderModule>>;
615 
ReconvergenceTestInstance(Context & context,const CaseDef & data)616     ReconvergenceTestInstance(Context &context, const CaseDef &data)
617         : TestInstance(context)
618         , m_data(data)
619         , m_subgroupSize(getSubgroupProperties(context).first.subgroupSize)
620     {
621     }
622     ~ReconvergenceTestInstance(void) = default;
623 
624     Move<VkPipeline> createComputePipeline(const VkPipelineLayout pipelineLayout, const VkShaderModule computeShader);
625     Move<VkPipeline> createGraphicsPipeline(const VkPipelineLayout pipelineLayout, const VkRenderPass renderPass,
626                                             const uint32_t width, const uint32_t height, const Shaders &shaders,
627                                             const VkPrimitiveTopology topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST,
628                                             const uint32_t patchControlPoints  = 0u);
629 
630 protected:
631     const CaseDef m_data;
632     const uint32_t m_subgroupSize;
633 };
634 
635 class ReconvergenceTestComputeInstance : public ReconvergenceTestInstance
636 {
637 public:
ReconvergenceTestComputeInstance(Context & context,const CaseDef & data,std::shared_ptr<RandomProgram> program,std::map<uint32_t,uint32_t> && subgroupSizeToMaxLoc)638     ReconvergenceTestComputeInstance(Context &context, const CaseDef &data, std::shared_ptr<RandomProgram> program,
639                                      std::map<uint32_t, uint32_t> &&subgroupSizeToMaxLoc)
640         : ReconvergenceTestInstance(context, data)
641         , m_program(std::static_pointer_cast<ComputeRandomProgram>(program))
642         , m_subgroupSizeToMaxLoc(std::move(subgroupSizeToMaxLoc))
643     {
644     }
645     ~ReconvergenceTestComputeInstance(void) = default;
646 
647     virtual tcu::TestStatus iterate(void) override;
648     qpTestResult_e calculateAndLogResult(const tcu::UVec4 *result, const std::vector<tcu::UVec4> &ref,
649                                          uint32_t invocationStride, uint32_t subgroupSize, uint32_t shaderMaxLoc);
650 
651 private:
652     std::shared_ptr<ComputeRandomProgram> m_program;
653     std::map<uint32_t, uint32_t> m_subgroupSizeToMaxLoc;
654 };
655 
656 class ReconvergenceTestGraphicsInstance : public ReconvergenceTestInstance
657 {
658 public:
ReconvergenceTestGraphicsInstance(Context & context,const CaseDef & data)659     ReconvergenceTestGraphicsInstance(Context &context, const CaseDef &data) : ReconvergenceTestInstance(context, data)
660     {
661     }
662     ~ReconvergenceTestGraphicsInstance(void) = default;
663 
664     auto makeRenderPassBeginInfo(const VkRenderPass renderPass, const VkFramebuffer framebuffer)
665         -> VkRenderPassBeginInfo;
666     virtual auto recordDrawingAndSubmit(const VkCommandBuffer cmdBuffer, const VkPipelineLayout pipelineLayout,
667                                         const VkPipeline pipeline, const VkDescriptorSet descriptorSet,
668                                         const PushConstant &pushConstant, const VkRenderPassBeginInfo &renderPassInfo,
669                                         const VkBuffer vertexBuffer, const uint32_t vertexCount, const VkImage image)
670         -> void;
671     virtual auto generateVertices(const uint32_t primitiveCount, const VkPrimitiveTopology topology,
672                                   const uint32_t patchSize = 1) -> std::vector<tcu::Vec4>;
673     virtual auto createVertexBufferAndFlush(const std::vector<tcu::Vec4> &vertices) -> de::MovePtr<BufferWithMemory>;
674     virtual auto createVertexBufferAndFlush(uint32_t cellsHorz, uint32_t cellsVert, VkPrimitiveTopology topology)
675         -> de::MovePtr<BufferWithMemory>;
676     virtual auto createShaders(void) -> Shaders = 0;
677 
678     enum PrintMode
679     {
680         None,
681         ThreadsInColumns,
682         OutLocsInColumns,
683         IntuitiveThreadsOutlocs,
684         Console
685     };
686 
687     virtual auto calculateAndLogResult(const uint64_t *result, const std::vector<uint64_t> &ref,
688                                        uint32_t invocationStride, uint32_t subgroupSize, uint32_t shaderMaxLocs,
689                                        uint32_t primitiveCount, PrintMode printMode) -> qpTestResult_e;
690 };
691 
692 class ReconvergenceTestFragmentInstance : public ReconvergenceTestGraphicsInstance
693 {
694     struct Arrangement
695     {
696     };
697     friend class FragmentRandomProgram;
698 
699 public:
ReconvergenceTestFragmentInstance(Context & context,const CaseDef & data)700     ReconvergenceTestFragmentInstance(Context &context, const CaseDef &data)
701         : ReconvergenceTestGraphicsInstance(context, data)
702     {
703     }
704     ~ReconvergenceTestFragmentInstance(void) = default;
705     virtual auto createShaders(void) -> std::vector<Move<VkShaderModule>> override;
706     auto callAuxiliaryShader(tcu::TestStatus &status, uint32_t triangleCount) -> std::vector<uint32_t>;
707     auto makeImageCreateInfo(VkFormat format) const -> VkImageCreateInfo;
708     virtual auto createVertexBufferAndFlush(uint32_t cellsHorz, uint32_t cellsVert, VkPrimitiveTopology topology)
709         -> de::MovePtr<BufferWithMemory> override;
710     virtual auto iterate(void) -> tcu::TestStatus override;
711     auto calculateAndLogResultEx(tcu::TestLog &log, const tcu::UVec4 *result, const std::vector<tcu::UVec4> &ref,
712                                  const uint32_t maxLoc, const Arrangement &a, const PrintMode printMode)
713         -> qpTestResult_e;
714 };
715 
716 class ReconvergenceTestVertexInstance : public ReconvergenceTestGraphicsInstance
717 {
718 public:
ReconvergenceTestVertexInstance(Context & context,const CaseDef & data)719     ReconvergenceTestVertexInstance(Context &context, const CaseDef &data)
720         : ReconvergenceTestGraphicsInstance(context, data)
721     {
722     }
723     ~ReconvergenceTestVertexInstance(void) = default;
724     virtual auto createShaders(void) -> std::vector<Move<VkShaderModule>> override;
725     virtual auto createVertexBufferAndFlush(uint32_t cellsHorz, uint32_t cellsVert, VkPrimitiveTopology topology)
726         -> de::MovePtr<BufferWithMemory> override;
727 
728     virtual auto iterate(void) -> tcu::TestStatus override;
729     auto calculateAndLogResultEx(add_ref<tcu::TestLog> log, const tcu::UVec4 *result,
730                                  const std::vector<tcu::UVec4> &ref, const uint32_t maxLoc, const PrintMode printMode)
731         -> qpTestResult_e;
732 };
733 
734 class ReconvergenceTestTessCtrlInstance : public ReconvergenceTestGraphicsInstance
735 {
736 public:
ReconvergenceTestTessCtrlInstance(Context & context,const CaseDef & data)737     ReconvergenceTestTessCtrlInstance(Context &context, const CaseDef &data)
738         : ReconvergenceTestGraphicsInstance(context, data)
739     {
740     }
741     ~ReconvergenceTestTessCtrlInstance(void) = default;
742     virtual auto createShaders(void) -> std::vector<Move<VkShaderModule>> override;
743     virtual auto iterate(void) -> tcu::TestStatus override;
744 };
745 
746 class ReconvergenceTestTessEvalInstance : public ReconvergenceTestGraphicsInstance
747 {
748 public:
ReconvergenceTestTessEvalInstance(Context & context,add_cref<CaseDef> data)749     ReconvergenceTestTessEvalInstance(Context &context, add_cref<CaseDef> data)
750         : ReconvergenceTestGraphicsInstance(context, data)
751     {
752     }
753     ~ReconvergenceTestTessEvalInstance(void) = default;
754     virtual auto createShaders(void) -> std::vector<Move<VkShaderModule>> override;
755     virtual auto iterate(void) -> tcu::TestStatus override;
756 };
757 
758 class ReconvergenceTestGeometryInstance : public ReconvergenceTestGraphicsInstance
759 {
760 public:
ReconvergenceTestGeometryInstance(Context & context,add_cref<CaseDef> data)761     ReconvergenceTestGeometryInstance(Context &context, add_cref<CaseDef> data)
762         : ReconvergenceTestGraphicsInstance(context, data)
763     {
764     }
765     ~ReconvergenceTestGeometryInstance(void) = default;
766     virtual auto createShaders(void) -> std::vector<Move<VkShaderModule>> override;
767     virtual auto createVertexBufferAndFlush(uint32_t cellsHorz, uint32_t cellsVert, VkPrimitiveTopology topology)
768         -> de::MovePtr<BufferWithMemory> override;
769 
770     virtual auto iterate(void) -> tcu::TestStatus override;
771     auto calculateAndLogResultEx(add_ref<tcu::TestLog> log, const tcu::UVec4 *result,
772                                  const std::vector<tcu::UVec4> &ref, const uint32_t maxLoc, const PrintMode printMode)
773         -> qpTestResult_e;
774 };
775 
createGraphicsPipeline(const VkPipelineLayout pipelineLayout,const VkRenderPass renderPass,const uint32_t width,const uint32_t height,const Shaders & shaders,const VkPrimitiveTopology topology,const uint32_t patchControlPoints)776 Move<VkPipeline> ReconvergenceTestInstance::createGraphicsPipeline(const VkPipelineLayout pipelineLayout,
777                                                                    const VkRenderPass renderPass, const uint32_t width,
778                                                                    const uint32_t height, const Shaders &shaders,
779                                                                    const VkPrimitiveTopology topology,
780                                                                    const uint32_t patchControlPoints)
781 {
782     const DeviceInterface &vkd = m_context.getDeviceInterface();
783     const VkDevice device      = m_context.getDevice();
784     const uint32_t subpass     = 0;
785 
786     const std::vector<VkViewport> viewports{makeViewport(width, height)};
787     const std::vector<VkRect2D> scissors{makeRect2D(width, height)};
788 
789     enum ShaderIndex
790     {
791         IVERT = 0,
792         IFRAG,
793         ITESC,
794         ITESE,
795         IGEOM
796     };
797     VkShaderModule handles[5] = {VK_NULL_HANDLE, VK_NULL_HANDLE, VK_NULL_HANDLE, VK_NULL_HANDLE,
798                                  VK_NULL_HANDLE}; // { vert, frag, tesc, tese, geom }
799 
800     for (uint32_t i = 0; i < (uint32_t)ARRAYSIZE(handles); ++i)
801     {
802         handles[i] = (i < (uint32_t)shaders.size()) ? *shaders[i] : VK_NULL_HANDLE;
803     }
804 
805     return makeGraphicsPipeline(vkd, device, pipelineLayout, handles[IVERT], handles[ITESC], handles[ITESE],
806                                 handles[IGEOM], handles[IFRAG], renderPass, viewports, scissors, topology, subpass,
807                                 patchControlPoints);
808 }
809 
createComputePipeline(const VkPipelineLayout pipelineLayout,const VkShaderModule computeShader)810 Move<VkPipeline> ReconvergenceTestInstance::createComputePipeline(const VkPipelineLayout pipelineLayout,
811                                                                   const VkShaderModule computeShader)
812 {
813     const DeviceInterface &vk = m_context.getDeviceInterface();
814     const VkDevice device     = m_context.getDevice();
815 
816     const uint32_t specData[2]                                               = {m_data.sizeX, m_data.sizeY};
817     const vk::VkSpecializationMapEntry entries[DE_LENGTH_OF_ARRAY(specData)] = {
818         {0, (uint32_t)(sizeof(uint32_t) * 0), sizeof(uint32_t)},
819         {1, (uint32_t)(sizeof(uint32_t) * 1), sizeof(uint32_t)},
820     };
821     const vk::VkSpecializationInfo specInfo = {
822         DE_LENGTH_OF_ARRAY(entries), // mapEntryCount
823         entries,                     // pMapEntries
824         sizeof(specData),            // dataSize
825         specData                     // pData
826     };
827 
828     const VkPipelineShaderStageRequiredSubgroupSizeCreateInfoEXT subgroupSizeCreateInfo = {
829         VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO_EXT, // VkStructureType sType;
830         nullptr,                                                                        // void* pNext;
831         m_subgroupSize // uint32_t requiredSubgroupSize;
832     };
833 
834     const VkBool32 computeFullSubgroups =
835         m_subgroupSize <= 64 && m_context.getSubgroupSizeControlFeatures().computeFullSubgroups;
836 
837     const void *shaderPNext = computeFullSubgroups ? &subgroupSizeCreateInfo : nullptr;
838     VkPipelineShaderStageCreateFlags pipelineShaderStageCreateFlags =
839         (VkPipelineShaderStageCreateFlags)(computeFullSubgroups ?
840                                                VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT_EXT :
841                                                0);
842 
843     const VkPipelineShaderStageCreateInfo shaderCreateInfo = {
844         VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
845         shaderPNext,
846         pipelineShaderStageCreateFlags,
847         VK_SHADER_STAGE_COMPUTE_BIT, // stage
848         computeShader,               // shader
849         "main",
850         &specInfo, // pSpecializationInfo
851     };
852 
853     const VkComputePipelineCreateInfo pipelineCreateInfo = {
854         VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
855         nullptr,
856         0u,               // flags
857         shaderCreateInfo, // cs
858         pipelineLayout,   // layout
859         VK_NULL_HANDLE,   // basePipelineHandle
860         0u,               // basePipelineIndex
861     };
862 
863     return vk::createComputePipeline(vk, device, VK_NULL_HANDLE, &pipelineCreateInfo, NULL);
864 }
865 
// Operation kinds that make up a generated random program.
// Enumerator order (and therefore numeric value) is unchanged.
enum OPType
{
    // Store subgroupBallot().
    // For OP_BALLOT, OP::caseValue is initialized to zero and then set to 1 by
    // simulate if the ballot is not workgroup- (or subgroup-) uniform.
    // Only workgroup-uniform ballots are validated for correctness in WUCF modes.
    OP_BALLOT,

    // Store a literal constant.
    OP_STORE,

    // if ((1ULL << gl_SubgroupInvocationID) & mask).
    // Special case: mask == ~0ULL is converted into "if (inputA.a[idx] == idx)".
    OP_IF_MASK,
    OP_ELSE_MASK,
    OP_ENDIF,

    // if (gl_SubgroupInvocationID == loopIdxN), where N is the most nested loop counter.
    OP_IF_LOOPCOUNT,
    OP_ELSE_LOOPCOUNT,

    // if (gl_LocalInvocationIndex >= inputA.a[N]), where N is the most nested loop counter.
    OP_IF_LOCAL_INVOCATION_INDEX,
    OP_ELSE_LOCAL_INVOCATION_INDEX,

    // break / continue
    OP_BREAK,
    OP_CONTINUE,

    // if (subgroupElect())
    OP_ELECT,

    // Loop with a uniform number of iterations (read from a buffer).
    OP_BEGIN_FOR_UNIF,
    OP_END_FOR_UNIF,

    // for (int loopIdxN = 0; loopIdxN < gl_SubgroupInvocationID + 1; ++loopIdxN)
    OP_BEGIN_FOR_VAR,
    OP_END_FOR_VAR,

    // for (int loopIdxN = 0;; ++loopIdxN, OP_BALLOT)
    // Always has an "if (subgroupElect()) break;" inside.
    // Does the equivalent of OP_BALLOT in the continue construct.
    OP_BEGIN_FOR_INF,
    OP_END_FOR_INF,

    // do { loopIdxN++; ... } while (loopIdxN < uniformValue);
    OP_BEGIN_DO_WHILE_UNIF,
    OP_END_DO_WHILE_UNIF,

    // do { ... } while (true);
    // Always has an "if (subgroupElect()) break;" inside.
    OP_BEGIN_DO_WHILE_INF,
    OP_END_DO_WHILE_INF,

    // return;
    OP_RETURN,

    // Function call (code bracketed by these is extracted into a separate function).
    OP_CALL_BEGIN,
    OP_CALL_END,

    // Switch statement on a uniform value.
    OP_SWITCH_UNIF_BEGIN,
    // Switch statement on the value of gl_SubgroupInvocationID & 3.
    OP_SWITCH_VAR_BEGIN,
    // Switch statement on a loopIdx value.
    OP_SWITCH_LOOP_COUNT_BEGIN,

    // Case statement with an (invocation mask, case mask) pair.
    OP_CASE_MASK_BEGIN,
    // Case statement used for loop counter switches, with a value and a mask of loop iterations.
    OP_CASE_LOOP_COUNT_BEGIN,

    // End of switch / case statement.
    OP_SWITCH_END,
    OP_CASE_END,

    // Extra code with no functional effect. Currently includes:
    // - value 0: while (!subgroupElect()) {}
    // - value 1: if (condition_that_is_false) { infinite loop }
    OP_NOISE,

    // Do nothing, only markup.
    OP_NOP
};
953 
OPtypeToStr(const OPType op)954 const char *OPtypeToStr(const OPType op)
955 {
956 #define MAKETEXT(s__) #s__
957 #define CASETEXT(e__) \
958     case e__:         \
959         return MAKETEXT(e__)
960     switch (op)
961     {
962         CASETEXT(OP_BALLOT);
963         CASETEXT(OP_STORE);
964         CASETEXT(OP_IF_MASK);
965         CASETEXT(OP_ELSE_MASK);
966         CASETEXT(OP_ENDIF);
967         CASETEXT(OP_IF_LOOPCOUNT);
968         CASETEXT(OP_ELSE_LOOPCOUNT);
969         CASETEXT(OP_IF_LOCAL_INVOCATION_INDEX);
970         CASETEXT(OP_ELSE_LOCAL_INVOCATION_INDEX);
971         CASETEXT(OP_BREAK);
972         CASETEXT(OP_CONTINUE);
973         CASETEXT(OP_ELECT);
974         CASETEXT(OP_BEGIN_FOR_UNIF);
975         CASETEXT(OP_END_FOR_UNIF);
976         CASETEXT(OP_BEGIN_FOR_VAR);
977         CASETEXT(OP_END_FOR_VAR);
978         CASETEXT(OP_BEGIN_FOR_INF);
979         CASETEXT(OP_END_FOR_INF);
980         CASETEXT(OP_BEGIN_DO_WHILE_UNIF);
981         CASETEXT(OP_END_DO_WHILE_UNIF);
982         CASETEXT(OP_BEGIN_DO_WHILE_INF);
983         CASETEXT(OP_END_DO_WHILE_INF);
984         CASETEXT(OP_RETURN);
985         CASETEXT(OP_CALL_BEGIN);
986         CASETEXT(OP_CALL_END);
987         CASETEXT(OP_SWITCH_UNIF_BEGIN);
988         CASETEXT(OP_SWITCH_VAR_BEGIN);
989         CASETEXT(OP_SWITCH_LOOP_COUNT_BEGIN);
990         CASETEXT(OP_CASE_MASK_BEGIN);
991         CASETEXT(OP_CASE_LOOP_COUNT_BEGIN);
992         CASETEXT(OP_SWITCH_END);
993         CASETEXT(OP_CASE_END);
994         CASETEXT(OP_NOISE);
995         CASETEXT(OP_NOP);
996     }
997     return "<Unknown>";
998 }
999 
// Kinds of condition an "if" can test; consumed by RandomProgram::genIf.
enum IFType
{
    IF_MASK,                   // emitted as OP_IF_MASK with a random mask
    IF_UNIFORM,                // emitted as OP_IF_MASK with all mask bits set
    IF_LOOPCOUNT,              // emitted as OP_IF_LOOPCOUNT
    IF_LOCAL_INVOCATION_INDEX, // emitted as OP_IF_LOCAL_INVOCATION_INDEX
};
1008 
1009 class OP
1010 {
1011 public:
OP(OPType _type,uint64_t _value,uint32_t _caseValue=0)1012     OP(OPType _type, uint64_t _value, uint32_t _caseValue = 0)
1013         : type(_type)
1014         , value(_value)
1015         // by default, initialized only lower part with a repetition of _value
1016         , bvalue(tcu::UVec4(uint32_t(_value), uint32_t(_value >> 32), uint32_t(_value), uint32_t(_value >> 32)))
1017         , caseValue(_caseValue)
1018     {
1019     }
1020 
1021     // The type of operation and an optional value.
1022     // The value could be a mask for an if test, the index of the loop
1023     // header for an end of loop, or the constant value for a store instruction
1024     OPType type;
1025     uint64_t value;
1026     Ballot bvalue;
1027     uint32_t caseValue;
1028 };
1029 
1030 class RandomProgram
1031 {
1032 
1033 public:
RandomProgram(const CaseDef & c,uint32_t invocationCount=0u)1034     RandomProgram(const CaseDef &c, uint32_t invocationCount = 0u)
1035         : caseDef(c)
1036         , invocationStride(invocationCount ? invocationCount : (c.sizeX * c.sizeY))
1037         , rnd()
1038         , ops()
1039         , masks()
1040         , ballotMasks()
1041         , numMasks(5)
1042         , nesting(0)
1043         , maxNesting(c.maxNesting)
1044         , loopNesting(0)
1045         , loopNestingThisFunction(0)
1046         , callNesting(0)
1047         , minCount(30)
1048         , indent(0)
1049         , isLoopInf(100, false)
1050         , doneInfLoopBreak(100, false)
1051         , storeBase(0x10000)
1052     {
1053         deRandom_init(&rnd, caseDef.seed);
1054         for (int i = 0; i < numMasks; ++i)
1055         {
1056             const uint64_t lo = deRandom_getUint64(&rnd);
1057             const uint64_t hi = deRandom_getUint64(&rnd);
1058             const tcu::UVec4 v4(uint32_t(lo), uint32_t(lo >> 32), uint32_t(hi), uint32_t(hi >> 32));
1059             ballotMasks.emplace_back(v4);
1060             masks.push_back(lo);
1061         }
1062     }
1063     virtual ~RandomProgram() = default;
1064 
1065     const CaseDef caseDef;
1066     const uint32_t invocationStride;
1067     deRandom rnd;
1068     vector<OP> ops;
1069     vector<uint64_t> masks;
1070     vector<Ballot> ballotMasks;
1071     int32_t numMasks;
1072     int32_t nesting;
1073     int32_t maxNesting;
1074     int32_t loopNesting;
1075     int32_t loopNestingThisFunction;
1076     int32_t callNesting;
1077     int32_t minCount;
1078     int32_t indent;
1079     vector<bool> isLoopInf;
1080     vector<bool> doneInfLoopBreak;
1081     // Offset the value we use for OP_STORE, to avoid colliding with fully converged
1082     // active masks with small subgroup sizes (e.g. with subgroupSize == 4, the SUCF
1083     // tests need to know that 0xF is really an active mask).
1084     int32_t storeBase;
1085 
genIf(IFType ifType,uint32_t maxLocalIndexCmp=0u)1086     virtual void genIf(IFType ifType, uint32_t maxLocalIndexCmp = 0u)
1087     {
1088         uint32_t maskIdx = deRandom_getUint32(&rnd) % numMasks;
1089         uint64_t mask    = masks[maskIdx];
1090         Ballot bmask     = ballotMasks[maskIdx];
1091         if (ifType == IF_UNIFORM)
1092         {
1093             mask = ~0ULL;
1094             bmask.set();
1095         }
1096 
1097         uint32_t localIndexCmp = deRandom_getUint32(&rnd) % (maxLocalIndexCmp ? maxLocalIndexCmp : invocationStride);
1098         if (ifType == IF_LOCAL_INVOCATION_INDEX)
1099             ops.push_back({OP_IF_LOCAL_INVOCATION_INDEX, localIndexCmp});
1100         else if (ifType == IF_LOOPCOUNT)
1101             ops.push_back({OP_IF_LOOPCOUNT, 0});
1102         else
1103         {
1104             ops.push_back({OP_IF_MASK, mask});
1105             ops.back().bvalue = bmask;
1106         }
1107 
1108         nesting++;
1109 
1110         size_t thenBegin = ops.size();
1111         pickOP(2);
1112         size_t thenEnd = ops.size();
1113 
1114         uint32_t randElse = (deRandom_getUint32(&rnd) % 100);
1115         if (randElse < 50)
1116         {
1117             if (ifType == IF_LOCAL_INVOCATION_INDEX)
1118                 ops.push_back({OP_ELSE_LOCAL_INVOCATION_INDEX, localIndexCmp});
1119             else if (ifType == IF_LOOPCOUNT)
1120                 ops.push_back({OP_ELSE_LOOPCOUNT, 0});
1121             else
1122                 ops.push_back({OP_ELSE_MASK, 0});
1123 
1124             if (randElse < 10)
1125             {
1126                 // Sometimes make the else block identical to the then block
1127                 for (size_t i = thenBegin; i < thenEnd; ++i)
1128                     ops.push_back(ops[i]);
1129             }
1130             else
1131                 pickOP(2);
1132         }
1133         ops.push_back({OP_ENDIF, 0});
1134         nesting--;
1135     }
1136 
genForUnif()1137     void genForUnif()
1138     {
1139         uint32_t iterCount = (deRandom_getUint32(&rnd) % 5) + 1;
1140         ops.push_back({OP_BEGIN_FOR_UNIF, iterCount});
1141         uint32_t loopheader = (uint32_t)ops.size() - 1;
1142         nesting++;
1143         loopNesting++;
1144         loopNestingThisFunction++;
1145         pickOP(2);
1146         ops.push_back({OP_END_FOR_UNIF, loopheader});
1147         loopNestingThisFunction--;
1148         loopNesting--;
1149         nesting--;
1150     }
1151 
genDoWhileUnif()1152     void genDoWhileUnif()
1153     {
1154         uint32_t iterCount = (deRandom_getUint32(&rnd) % 5) + 1;
1155         ops.push_back({OP_BEGIN_DO_WHILE_UNIF, iterCount});
1156         uint32_t loopheader = (uint32_t)ops.size() - 1;
1157         nesting++;
1158         loopNesting++;
1159         loopNestingThisFunction++;
1160         pickOP(2);
1161         ops.push_back({OP_END_DO_WHILE_UNIF, loopheader});
1162         loopNestingThisFunction--;
1163         loopNesting--;
1164         nesting--;
1165     }
1166 
genForVar()1167     void genForVar()
1168     {
1169         ops.push_back({OP_BEGIN_FOR_VAR, 0});
1170         uint32_t loopheader = (uint32_t)ops.size() - 1;
1171         nesting++;
1172         loopNesting++;
1173         loopNestingThisFunction++;
1174         pickOP(2);
1175         ops.push_back({OP_END_FOR_VAR, loopheader});
1176         loopNestingThisFunction--;
1177         loopNesting--;
1178         nesting--;
1179     }
1180 
genForInf()1181     void genForInf()
1182     {
1183         ops.push_back({OP_BEGIN_FOR_INF, 0});
1184         uint32_t loopheader = (uint32_t)ops.size() - 1;
1185 
1186         nesting++;
1187         loopNesting++;
1188         loopNestingThisFunction++;
1189         isLoopInf[loopNesting]        = true;
1190         doneInfLoopBreak[loopNesting] = false;
1191 
1192         pickOP(2);
1193 
1194         genElect(true);
1195         doneInfLoopBreak[loopNesting] = true;
1196 
1197         pickOP(2);
1198 
1199         ops.push_back({OP_END_FOR_INF, loopheader});
1200 
1201         isLoopInf[loopNesting]        = false;
1202         doneInfLoopBreak[loopNesting] = false;
1203         loopNestingThisFunction--;
1204         loopNesting--;
1205         nesting--;
1206     }
1207 
genDoWhileInf()1208     void genDoWhileInf()
1209     {
1210         ops.push_back({OP_BEGIN_DO_WHILE_INF, 0});
1211         uint32_t loopheader = (uint32_t)ops.size() - 1;
1212 
1213         nesting++;
1214         loopNesting++;
1215         loopNestingThisFunction++;
1216         isLoopInf[loopNesting]        = true;
1217         doneInfLoopBreak[loopNesting] = false;
1218 
1219         pickOP(2);
1220 
1221         genElect(true);
1222         doneInfLoopBreak[loopNesting] = true;
1223 
1224         pickOP(2);
1225 
1226         ops.push_back({OP_END_DO_WHILE_INF, loopheader});
1227 
1228         isLoopInf[loopNesting]        = false;
1229         doneInfLoopBreak[loopNesting] = false;
1230         loopNestingThisFunction--;
1231         loopNesting--;
1232         nesting--;
1233     }
1234 
genBreak()1235     void genBreak()
1236     {
1237         if (loopNestingThisFunction > 0)
1238         {
1239             // Sometimes put the break in a divergent if
1240             if ((deRandom_getUint32(&rnd) % 100) < 10)
1241             {
1242                 ops.push_back({OP_IF_MASK, masks[0]});
1243                 ops.back().bvalue = ballotMasks[0];
1244                 ops.push_back({OP_BREAK, 0});
1245                 ops.push_back({OP_ELSE_MASK, 0});
1246                 ops.push_back({OP_BREAK, 0});
1247                 ops.push_back({OP_ENDIF, 0});
1248             }
1249             else
1250                 ops.push_back({OP_BREAK, 0});
1251         }
1252     }
1253 
genContinue()1254     void genContinue()
1255     {
1256         // continues are allowed if we're in a loop and the loop is not infinite,
1257         // or if it is infinite and we've already done a subgroupElect+break.
1258         // However, adding more continues seems to reduce the failure rate, so
1259         // disable it for now
1260         if (loopNestingThisFunction > 0 && !(isLoopInf[loopNesting] /*&& !doneInfLoopBreak[loopNesting]*/))
1261         {
1262             // Sometimes put the continue in a divergent if
1263             if ((deRandom_getUint32(&rnd) % 100) < 10)
1264             {
1265                 ops.push_back({OP_IF_MASK, masks[0]});
1266                 ops.back().bvalue = ballotMasks[0];
1267                 ops.push_back({OP_CONTINUE, 0});
1268                 ops.push_back({OP_ELSE_MASK, 0});
1269                 ops.push_back({OP_CONTINUE, 0});
1270                 ops.push_back({OP_ENDIF, 0});
1271             }
1272             else
1273                 ops.push_back({OP_CONTINUE, 0});
1274         }
1275     }
1276 
1277     // doBreak is used to generate "if (subgroupElect()) { ... break; }" inside infinite loops
genElect(bool doBreak)1278     void genElect(bool doBreak)
1279     {
1280         ops.push_back({OP_ELECT, 0});
1281         nesting++;
1282         if (doBreak)
1283         {
1284             // Put something interestign before the break
1285             genBallot();
1286             genBallot();
1287             if ((deRandom_getUint32(&rnd) % 100) < 10)
1288                 pickOP(1);
1289 
1290             // if we're in a function, sometimes  use return instead
1291             if (callNesting > 0 && (deRandom_getUint32(&rnd) % 100) < 30)
1292                 ops.push_back({OP_RETURN, 0});
1293             else
1294                 genBreak();
1295         }
1296         else
1297             pickOP(2);
1298 
1299         ops.push_back({OP_ENDIF, 0});
1300         nesting--;
1301     }
1302 
genReturn()1303     void genReturn()
1304     {
1305         uint32_t r = deRandom_getUint32(&rnd) % 100;
1306         if (nesting > 0 &&
1307             // Use return rarely in main, 20% of the time in a singly nested loop in a function
1308             // and 50% of the time in a multiply nested loop in a function
1309             (r < 5 || (callNesting > 0 && loopNestingThisFunction > 0 && r < 20) ||
1310              (callNesting > 0 && loopNestingThisFunction > 1 && r < 50)))
1311         {
1312             genBallot();
1313             if ((deRandom_getUint32(&rnd) % 100) < 10)
1314             {
1315                 ops.push_back({OP_IF_MASK, masks[0]});
1316                 ops.back().bvalue = ballotMasks[0];
1317                 ops.push_back({OP_RETURN, 0});
1318                 ops.push_back({OP_ELSE_MASK, 0});
1319                 ops.push_back({OP_RETURN, 0});
1320                 ops.push_back({OP_ENDIF, 0});
1321             }
1322             else
1323                 ops.push_back({OP_RETURN, 0});
1324         }
1325     }
1326 
1327     // Generate a function call. Save and restore some loop information, which is used to
1328     // determine when it's safe to use break/continue
genCall()1329     void genCall()
1330     {
1331         ops.push_back({OP_CALL_BEGIN, 0});
1332         callNesting++;
1333         nesting++;
1334         int32_t saveLoopNestingThisFunction = loopNestingThisFunction;
1335         loopNestingThisFunction             = 0;
1336 
1337         pickOP(2);
1338 
1339         loopNestingThisFunction = saveLoopNestingThisFunction;
1340         nesting--;
1341         callNesting--;
1342         ops.push_back({OP_CALL_END, 0});
1343     }
1344 
1345     // Generate switch on a uniform value:
1346     // switch (inputA.a[r]) {
1347     // case r+1: ... break; // should not execute
1348     // case r:   ... break; // should branch uniformly
1349     // case r+2: ... break; // should not execute
1350     // }
genSwitchUnif()1351     void genSwitchUnif()
1352     {
1353         uint32_t r = deRandom_getUint32(&rnd) % 5;
1354         ops.push_back({OP_SWITCH_UNIF_BEGIN, r});
1355         nesting++;
1356 
1357         ops.push_back({OP_CASE_MASK_BEGIN, 0, 1u << (r + 1)});
1358         pickOP(1);
1359         ops.push_back({OP_CASE_END, 0});
1360 
1361         ops.push_back({OP_CASE_MASK_BEGIN, ~0ULL, 1u << r});
1362         ops.back().bvalue.set();
1363         pickOP(2);
1364         ops.push_back({OP_CASE_END, 0});
1365 
1366         ops.push_back({OP_CASE_MASK_BEGIN, 0, 1u << (r + 2)});
1367         pickOP(1);
1368         ops.push_back({OP_CASE_END, 0});
1369 
1370         ops.push_back({OP_SWITCH_END, 0});
1371         nesting--;
1372     }
1373 
1374     // switch (gl_SubgroupInvocationID & 3) with four unique targets
genSwitchVar()1375     void genSwitchVar()
1376     {
1377         ops.push_back({OP_SWITCH_VAR_BEGIN, 0});
1378         nesting++;
1379 
1380         ops.push_back({OP_CASE_MASK_BEGIN, 0x1111111111111111ULL, 1 << 0});
1381         ops.back().bvalue = tcu::UVec4(0x11111111);
1382         pickOP(1);
1383         ops.push_back({OP_CASE_END, 0});
1384 
1385         ops.push_back({OP_CASE_MASK_BEGIN, 0x2222222222222222ULL, 1 << 1});
1386         ops.back().bvalue = tcu::UVec4(0x22222222);
1387         pickOP(1);
1388         ops.push_back({OP_CASE_END, 0});
1389 
1390         ops.push_back({OP_CASE_MASK_BEGIN, 0x4444444444444444ULL, 1 << 2});
1391         ops.back().bvalue = tcu::UVec4(0x44444444);
1392         pickOP(1);
1393         ops.push_back({OP_CASE_END, 0});
1394 
1395         ops.push_back({OP_CASE_MASK_BEGIN, 0x8888888888888888ULL, 1 << 3});
1396         ops.back().bvalue = tcu::UVec4(0x88888888);
1397         pickOP(1);
1398         ops.push_back({OP_CASE_END, 0});
1399 
1400         ops.push_back({OP_SWITCH_END, 0});
1401         nesting--;
1402     }
1403 
1404     // switch (gl_SubgroupInvocationID & 3) with two shared targets.
1405     // XXX TODO: The test considers these two targets to remain converged,
1406     // though we haven't agreed to that behavior yet.
genSwitchMulticase()1407     void genSwitchMulticase()
1408     {
1409         ops.push_back({OP_SWITCH_VAR_BEGIN, 0});
1410         nesting++;
1411 
1412         ops.push_back({OP_CASE_MASK_BEGIN, 0x3333333333333333ULL, (1 << 0) | (1 << 1)});
1413         ops.back().bvalue = tcu::UVec4(0x33333333);
1414         pickOP(2);
1415         ops.push_back({OP_CASE_END, 0});
1416 
1417         ops.push_back({OP_CASE_MASK_BEGIN, 0xCCCCCCCCCCCCCCCCULL, (1 << 2) | (1 << 3)});
1418         ops.back().bvalue = tcu::UVec4(0xCCCCCCCC);
1419         pickOP(2);
1420         ops.push_back({OP_CASE_END, 0});
1421 
1422         ops.push_back({OP_SWITCH_END, 0});
1423         nesting--;
1424     }
1425 
1426     // switch (loopIdxN) {
1427     // case 1:  ... break;
1428     // case 2:  ... break;
1429     // default: ... break;
1430     // }
genSwitchLoopCount()1431     void genSwitchLoopCount()
1432     {
1433         uint32_t r = deRandom_getUint32(&rnd) % loopNesting;
1434         ops.push_back({OP_SWITCH_LOOP_COUNT_BEGIN, r});
1435         nesting++;
1436 
1437         ops.push_back({OP_CASE_LOOP_COUNT_BEGIN, 1ULL << 1, 1});
1438         ops.back().bvalue = tcu::UVec4(1 << 1, 0, 0, 0);
1439         pickOP(1);
1440         ops.push_back({OP_CASE_END, 0});
1441 
1442         ops.push_back({OP_CASE_LOOP_COUNT_BEGIN, 1ULL << 2, 2});
1443         ops.back().bvalue = tcu::UVec4(1 << 2, 0, 0, 0);
1444         pickOP(1);
1445         ops.push_back({OP_CASE_END, 0});
1446 
1447         // default:
1448         ops.push_back({OP_CASE_LOOP_COUNT_BEGIN, ~6ULL, 0xFFFFFFFF});
1449         ops.back().bvalue = tcu::UVec4(~6u, ~0u, ~0u, ~0u);
1450         pickOP(1);
1451         ops.push_back({OP_CASE_END, 0});
1452 
1453         ops.push_back({OP_SWITCH_END, 0});
1454         nesting--;
1455     }
1456 
pickOP(uint32_t count)1457     void pickOP(uint32_t count)
1458     {
1459         // Pick "count" instructions. These can recursively insert more instructions,
1460         // so "count" is just a seed
1461         for (uint32_t i = 0; i < count; ++i)
1462         {
1463             genBallot();
1464             if (nesting < maxNesting)
1465             {
1466                 uint32_t r = deRandom_getUint32(&rnd) % 11;
1467                 switch (r)
1468                 {
1469                 default:
1470                     DE_ASSERT(0);
1471                     // fallthrough
1472                 case 2:
1473                     if (loopNesting)
1474                     {
1475                         genIf(IF_LOOPCOUNT);
1476                         break;
1477                     }
1478                     // fallthrough
1479                 case 10:
1480                     genIf(IF_LOCAL_INVOCATION_INDEX);
1481                     break;
1482                 case 0:
1483                     genIf(IF_MASK);
1484                     break;
1485                 case 1:
1486                     genIf(IF_UNIFORM);
1487                     break;
1488                 case 3:
1489                 {
1490                     // don't nest loops too deeply, to avoid extreme memory usage or timeouts
1491                     if (loopNesting <= 3)
1492                     {
1493                         uint32_t r2 = deRandom_getUint32(&rnd) % 3;
1494                         switch (r2)
1495                         {
1496                         default:
1497                             DE_ASSERT(0); // fallthrough
1498                         case 0:
1499                             genForUnif();
1500                             break;
1501                         case 1:
1502                             genForInf();
1503                             break;
1504                         case 2:
1505                             genForVar();
1506                             break;
1507                         }
1508                     }
1509                 }
1510                 break;
1511                 case 4:
1512                     genBreak();
1513                     break;
1514                 case 5:
1515                     genContinue();
1516                     break;
1517                 case 6:
1518                     genElect(false);
1519                     break;
1520                 case 7:
1521                 {
1522                     uint32_t r2 = deRandom_getUint32(&rnd) % 5;
1523                     if (r2 == 0 && callNesting == 0 && nesting < maxNesting - 2)
1524                         genCall();
1525                     else
1526                         genReturn();
1527                     break;
1528                 }
1529                 case 8:
1530                 {
1531                     // don't nest loops too deeply, to avoid extreme memory usage or timeouts
1532                     if (loopNesting <= 3)
1533                     {
1534                         uint32_t r2 = deRandom_getUint32(&rnd) % 2;
1535                         switch (r2)
1536                         {
1537                         default:
1538                             DE_ASSERT(0); // fallthrough
1539                         case 0:
1540                             genDoWhileUnif();
1541                             break;
1542                         case 1:
1543                             genDoWhileInf();
1544                             break;
1545                         }
1546                     }
1547                 }
1548                 break;
1549                 case 9:
1550                 {
1551                     uint32_t r2 = deRandom_getUint32(&rnd) % 4;
1552                     switch (r2)
1553                     {
1554                     default:
1555                         DE_ASSERT(0);
1556                         // fallthrough
1557                     case 0:
1558                         genSwitchUnif();
1559                         break;
1560                     case 1:
1561                         if (loopNesting > 0)
1562                         {
1563                             genSwitchLoopCount();
1564                             break;
1565                         }
1566                         // fallthrough
1567                     case 2:
1568                         if (caseDef.testType != TT_MAXIMAL)
1569                         {
1570                             // multicase doesn't have fully-defined behavior for MAXIMAL tests,
1571                             // but does for SUCF tests
1572                             genSwitchMulticase();
1573                             break;
1574                         }
1575                         // fallthrough
1576                     case 3:
1577                         genSwitchVar();
1578                         break;
1579                     }
1580                 }
1581                 break;
1582                 }
1583             }
1584             genBallot();
1585         }
1586     }
1587 
genBallot()1588     void genBallot()
1589     {
1590         // optionally insert ballots, stores, and noise. Ballots and stores are used to determine
1591         // correctness.
1592         if ((deRandom_getUint32(&rnd) % 100) < 20)
1593         {
1594             if (ops.size() < 2 || !(ops[ops.size() - 1].type == OP_BALLOT ||
1595                                     (ops[ops.size() - 1].type == OP_STORE && ops[ops.size() - 2].type == OP_BALLOT)))
1596             {
1597                 // do a store along with each ballot, so we can correlate where
1598                 // the ballot came from
1599                 if (caseDef.testType != TT_MAXIMAL)
1600                     ops.push_back({OP_STORE, (uint32_t)ops.size() + storeBase});
1601                 ops.push_back({OP_BALLOT, 0});
1602             }
1603         }
1604 
1605         if ((deRandom_getUint32(&rnd) % 100) < 10)
1606         {
1607             if (ops.size() < 2 || !(ops[ops.size() - 1].type == OP_STORE ||
1608                                     (ops[ops.size() - 1].type == OP_BALLOT && ops[ops.size() - 2].type == OP_STORE)))
1609             {
1610                 // SUCF does a store with every ballot. Don't bloat the code by adding more.
1611                 if (caseDef.testType == TT_MAXIMAL)
1612                     ops.push_back({OP_STORE, (uint32_t)ops.size() + storeBase});
1613             }
1614         }
1615 
1616         uint32_t r = deRandom_getUint32(&rnd) % 10000;
1617         if (r < 3)
1618             ops.push_back({OP_NOISE, 0});
1619         else if (r < 10)
1620             ops.push_back({OP_NOISE, 1});
1621     }
1622 
generateRandomProgram(qpWatchDog * watchDog,add_ref<tcu::TestLog> log)1623     std::map<uint32_t, uint32_t> generateRandomProgram(qpWatchDog *watchDog, add_ref<tcu::TestLog> log)
1624     {
1625         std::vector<tcu::UVec4> ref;
1626         std::map<uint32_t, uint32_t> subgroupSizeToMaxLoc;
1627 
1628         do
1629         {
1630             ops.clear();
1631             while ((int32_t)ops.size() < minCount)
1632                 pickOP(1);
1633 
1634             // Retry until the program has some UCF results in it
1635             if (caseDef.isUCF())
1636             {
1637                 // Simulate for all subgroup sizes, to determine whether OP_BALLOTs are nonuniform
1638                 for (int32_t subgroupSize = 4; subgroupSize <= 128; subgroupSize *= 2)
1639                 {
1640                     //simulate(true, subgroupSize, ref);
1641                     const uint32_t maxLoc = execute(watchDog, true, subgroupSize, 0u, invocationStride, ref, log);
1642                     subgroupSizeToMaxLoc[subgroupSize] = maxLoc;
1643                 }
1644             }
1645         } while (caseDef.isUCF() && !hasUCF());
1646 
1647         return subgroupSizeToMaxLoc;
1648     }
1649 
printIndent(std::stringstream & css)1650     void printIndent(std::stringstream &css)
1651     {
1652         for (int32_t i = 0; i < indent; ++i)
1653             css << " ";
1654     }
1655 
1656     struct FlowState
1657     {
1658         add_cref<vector<OP>> ops;
1659         const int32_t opsIndex;
1660         const int32_t loopNesting;
1661         const int funcNum;
1662     };
1663 
1664     // State of the subgroup at each level of nesting
1665     struct SubgroupState
1666     {
1667         // Currently executing
1668         bitset_inv_t activeMask;
1669         // Have executed a continue instruction in this loop
1670         bitset_inv_t continueMask;
1671         // index of the current if test or loop header
1672         uint32_t header;
1673         // number of loop iterations performed
1674         uint32_t tripCount;
1675         // is this nesting a loop?
1676         uint32_t isLoop;
1677         // is this nesting a function call?
1678         uint32_t isCall;
1679         // is this nesting a switch?
1680         uint32_t isSwitch;
1681     };
1682 
1683     struct SubgroupState2
1684     {
1685         // Currently executing
1686         Ballots activeMask;
1687         // Have executed a continue instruction in this loop
1688         Ballots continueMask;
1689         // index of the current if test or loop header
1690         uint32_t header;
1691         // number of loop iterations performed
1692         uint32_t tripCount;
1693         // is this nesting a loop?
1694         uint32_t isLoop;
1695         // is this nesting a function call?
1696         uint32_t isCall;
1697         // is this nesting a switch?
1698         uint32_t isSwitch;
1699         virtual ~SubgroupState2() = default;
SubgroupState2vkt::Reconvergence::__anone030def80111::RandomProgram::SubgroupState21700         SubgroupState2() : SubgroupState2(0)
1701         {
1702         }
SubgroupState2vkt::Reconvergence::__anone030def80111::RandomProgram::SubgroupState21703         SubgroupState2(uint32_t subgroupCount)
1704             : activeMask(subgroupCount)
1705             , continueMask(subgroupCount)
1706             , header()
1707             , tripCount()
1708             , isLoop()
1709             , isCall()
1710             , isSwitch()
1711         {
1712         }
1713     };
1714 
1715     struct Prerequisites
1716     {
1717     };
1718 
getPartitionBallotText()1719     virtual std::string getPartitionBallotText()
1720     {
1721         return "subgroupBallot(true)";
1722     }
1723 
printIfLocalInvocationIndex(std::stringstream & css,add_cref<FlowState> flow)1724     virtual void printIfLocalInvocationIndex(std::stringstream &css, add_cref<FlowState> flow)
1725     {
1726         printIndent(css);
1727         css << "if (gl_LocalInvocationIndex >= inputA.a[0x" << std::hex << flow.ops[flow.opsIndex].value << "]) {\n";
1728     }
1729 
printStore(std::stringstream & css,add_cref<FlowState> flow)1730     virtual void printStore(std::stringstream &css, add_cref<FlowState> flow)
1731     {
1732         printIndent(css);
1733         css << "outputC.loc[gl_LocalInvocationIndex]++;\n";
1734         printIndent(css);
1735         css << "outputB.b[(outLoc++)*invocationStride + gl_LocalInvocationIndex].x = 0x" << std::hex
1736             << flow.ops[flow.opsIndex].value << ";\n";
1737     }
1738 
printBallot(std::stringstream & css,add_cref<FlowState>,bool endWithSemicolon=false)1739     virtual void printBallot(std::stringstream &css, add_cref<FlowState>, bool endWithSemicolon = false)
1740     {
1741         printIndent(css);
1742 
1743         css << "outputC.loc[gl_LocalInvocationIndex]++,";
1744         // When inside loop(s), use partitionBallot rather than subgroupBallot to compute
1745         // a ballot, to make sure the ballot is "diverged enough". Don't do this for
1746         // subgroup_uniform_control_flow, since we only validate results that must be fully
1747         // reconverged.
1748         if (loopNesting > 0 && caseDef.testType == TT_MAXIMAL)
1749         {
1750             css << "outputB.b[(outLoc++)*invocationStride + gl_LocalInvocationIndex] = " << getPartitionBallotText()
1751                 << ".xy";
1752         }
1753         else if (caseDef.isElect())
1754         {
1755             css << "outputB.b[(outLoc++)*invocationStride + gl_LocalInvocationIndex].x = elect()";
1756         }
1757         else
1758         {
1759             css << "outputB.b[(outLoc++)*invocationStride + gl_LocalInvocationIndex] = subgroupBallot(true).xy";
1760         }
1761         if (endWithSemicolon)
1762         {
1763             css << ";\n";
1764         }
1765     }
1766 
printCode(std::stringstream & functions,std::stringstream & main)1767     void printCode(std::stringstream &functions, std::stringstream &main)
1768     {
1769         std::stringstream *css = &main;
1770         indent                 = 4;
1771         loopNesting            = 0;
1772         int funcNum            = 0;
1773         int32_t i              = 0;
1774 
1775         auto makeFlowState = [&]() -> FlowState { return FlowState{ops, i, loopNesting, funcNum}; };
1776 
1777         for (; i < (int32_t)ops.size(); ++i)
1778         {
1779             switch (ops[i].type)
1780             {
1781             case OP_IF_MASK:
1782                 printIndent(*css);
1783                 if (ops[i].value == ~0ULL)
1784                 {
1785                     // This equality test will always succeed, since inputA.a[i] == i
1786                     int idx = deRandom_getUint32(&rnd) % 4;
1787                     *css << "if (inputA.a[" << idx << "] == " << idx << ") {\n";
1788                 }
1789                 else
1790                 {
1791                     const tcu::UVec4 v(ops[i].bvalue);
1792                     *css << std::hex << "if (testBit(uvec4("
1793                          << "0x" << v.x() << ", "
1794                          << "0x" << v.y() << ", "
1795                          << "0x" << v.z() << ", "
1796                          << "0x" << v.w() << std::dec << "), gl_SubgroupInvocationID)) {\n";
1797                 }
1798                 indent += 4;
1799                 break;
1800             case OP_IF_LOOPCOUNT:
1801                 printIndent(*css);
1802                 *css << "if (gl_SubgroupInvocationID == loopIdx" << loopNesting - 1 << ") {\n";
1803                 indent += 4;
1804                 break;
1805             case OP_IF_LOCAL_INVOCATION_INDEX:
1806                 printIfLocalInvocationIndex(*css, makeFlowState());
1807                 indent += 4;
1808                 break;
1809             case OP_ELSE_MASK:
1810             case OP_ELSE_LOOPCOUNT:
1811             case OP_ELSE_LOCAL_INVOCATION_INDEX:
1812                 indent -= 4;
1813                 printIndent(*css);
1814                 *css << "} else {\n";
1815                 indent += 4;
1816                 break;
1817             case OP_ENDIF:
1818                 indent -= 4;
1819                 printIndent(*css);
1820                 *css << "}\n";
1821                 break;
1822             case OP_BALLOT:
1823                 printBallot(*css, makeFlowState(), true);
1824                 break;
1825             case OP_STORE:
1826                 printStore(*css, makeFlowState());
1827                 break;
1828             case OP_BEGIN_FOR_VAR:
1829                 printIndent(*css);
1830                 *css << "for (int loopIdx" << loopNesting << " = 0;\n";
1831                 printIndent(*css);
1832                 *css << "         loopIdx" << loopNesting << " < gl_SubgroupInvocationID + 1;\n";
1833                 printIndent(*css);
1834                 *css << "         loopIdx" << loopNesting << "++) {\n";
1835                 indent += 4;
1836                 loopNesting++;
1837                 break;
1838             case OP_END_FOR_VAR:
1839                 loopNesting--;
1840                 indent -= 4;
1841                 printIndent(*css);
1842                 *css << "}\n";
1843                 break;
1844             case OP_BEGIN_FOR_UNIF:
1845                 printIndent(*css);
1846                 *css << "for (int loopIdx" << loopNesting << " = 0;\n";
1847                 printIndent(*css);
1848                 *css << "         loopIdx" << loopNesting << " < inputA.a[" << ops[i].value << "];\n";
1849                 printIndent(*css);
1850                 *css << "         loopIdx" << loopNesting << "++) {\n";
1851                 indent += 4;
1852                 loopNesting++;
1853                 break;
1854             case OP_END_FOR_UNIF:
1855                 loopNesting--;
1856                 indent -= 4;
1857                 printIndent(*css);
1858                 *css << "}\n";
1859                 break;
1860             case OP_BEGIN_FOR_INF:
1861                 printIndent(*css);
1862                 *css << "for (int loopIdx" << loopNesting << " = 0;;loopIdx" << loopNesting << "++,";
1863                 loopNesting++;
1864                 printBallot(*css, makeFlowState());
1865                 *css << ") {\n";
1866                 indent += 4;
1867                 break;
1868             case OP_END_FOR_INF:
1869                 loopNesting--;
1870                 indent -= 4;
1871                 printIndent(*css);
1872                 *css << "}\n";
1873                 break;
1874             case OP_BEGIN_DO_WHILE_UNIF:
1875                 printIndent(*css);
1876                 *css << "{\n";
1877                 indent += 4;
1878                 printIndent(*css);
1879                 *css << "int loopIdx" << loopNesting << " = 0;\n";
1880                 printIndent(*css);
1881                 *css << "do {\n";
1882                 indent += 4;
1883                 printIndent(*css);
1884                 *css << "loopIdx" << loopNesting << "++;\n";
1885                 loopNesting++;
1886                 break;
1887             case OP_END_DO_WHILE_UNIF:
1888                 loopNesting--;
1889                 indent -= 4;
1890                 printIndent(*css);
1891                 *css << "} while (loopIdx" << loopNesting << " < inputA.a[" << ops[(uint32_t)ops[i].value].value
1892                      << "]);\n";
1893                 indent -= 4;
1894                 printIndent(*css);
1895                 *css << "}\n";
1896                 break;
1897             case OP_BEGIN_DO_WHILE_INF:
1898                 printIndent(*css);
1899                 *css << "{\n";
1900                 indent += 4;
1901                 printIndent(*css);
1902                 *css << "int loopIdx" << loopNesting << " = 0;\n";
1903                 printIndent(*css);
1904                 *css << "do {\n";
1905                 indent += 4;
1906                 loopNesting++;
1907                 break;
1908             case OP_END_DO_WHILE_INF:
1909                 loopNesting--;
1910                 printIndent(*css);
1911                 *css << "loopIdx" << loopNesting << "++;\n";
1912                 indent -= 4;
1913                 printIndent(*css);
1914                 *css << "} while (true);\n";
1915                 indent -= 4;
1916                 printIndent(*css);
1917                 *css << "}\n";
1918                 break;
1919             case OP_BREAK:
1920                 printIndent(*css);
1921                 *css << "break;\n";
1922                 break;
1923             case OP_CONTINUE:
1924                 printIndent(*css);
1925                 *css << "continue;\n";
1926                 break;
1927             case OP_ELECT:
1928                 printIndent(*css);
1929                 *css << "if (subgroupElect()) {\n";
1930                 indent += 4;
1931                 break;
1932             case OP_RETURN:
1933                 printIndent(*css);
1934                 *css << "return;\n";
1935                 break;
1936             case OP_CALL_BEGIN:
1937                 printIndent(*css);
1938                 *css << "func" << funcNum << "(";
1939                 for (int32_t n = 0; n < loopNesting; ++n)
1940                 {
1941                     *css << "loopIdx" << n;
1942                     if (n != loopNesting - 1)
1943                         *css << ", ";
1944                 }
1945                 *css << ");\n";
1946                 css = &functions;
1947                 printIndent(*css);
1948                 *css << "void func" << funcNum << "(";
1949                 for (int32_t n = 0; n < loopNesting; ++n)
1950                 {
1951                     *css << "int loopIdx" << n;
1952                     if (n != loopNesting - 1)
1953                         *css << ", ";
1954                 }
1955                 *css << ") {\n";
1956                 indent += 4;
1957                 funcNum++;
1958                 break;
1959             case OP_CALL_END:
1960                 indent -= 4;
1961                 printIndent(*css);
1962                 *css << "}\n";
1963                 css = &main;
1964                 break;
1965             case OP_NOISE:
1966                 if (ops[i].value == 0)
1967                 {
1968                     printIndent(*css);
1969                     *css << "while (!subgroupElect()) {}\n";
1970                 }
1971                 else
1972                 {
1973                     printIndent(*css);
1974                     *css << "if (inputA.a[0] == 12345) {\n";
1975                     indent += 4;
1976                     printIndent(*css);
1977                     *css << "while (true) {\n";
1978                     indent += 4;
1979                     printBallot(*css, makeFlowState(), true);
1980                     indent -= 4;
1981                     printIndent(*css);
1982                     *css << "}\n";
1983                     indent -= 4;
1984                     printIndent(*css);
1985                     *css << "}\n";
1986                 }
1987                 break;
1988             case OP_SWITCH_UNIF_BEGIN:
1989                 printIndent(*css);
1990                 *css << "switch (inputA.a[" << ops[i].value << "]) {\n";
1991                 indent += 4;
1992                 break;
1993             case OP_SWITCH_VAR_BEGIN:
1994                 printIndent(*css);
1995                 *css << "switch (gl_SubgroupInvocationID & 3) {\n";
1996                 indent += 4;
1997                 break;
1998             case OP_SWITCH_LOOP_COUNT_BEGIN:
1999                 printIndent(*css);
2000                 *css << "switch (loopIdx" << ops[i].value << ") {\n";
2001                 indent += 4;
2002                 break;
2003             case OP_SWITCH_END:
2004                 indent -= 4;
2005                 printIndent(*css);
2006                 *css << "}\n";
2007                 break;
2008             case OP_CASE_MASK_BEGIN:
2009                 for (int32_t b = 0; b < 32; ++b)
2010                 {
2011                     if ((1u << b) & ops[i].caseValue)
2012                     {
2013                         printIndent(*css);
2014                         *css << "case " << b << ":\n";
2015                     }
2016                 }
2017                 printIndent(*css);
2018                 *css << "{\n";
2019                 indent += 4;
2020                 break;
2021             case OP_CASE_LOOP_COUNT_BEGIN:
2022                 if (ops[i].caseValue == 0xFFFFFFFF)
2023                 {
2024                     printIndent(*css);
2025                     *css << "default: {\n";
2026                 }
2027                 else
2028                 {
2029                     printIndent(*css);
2030                     *css << "case " << ops[i].caseValue << ": {\n";
2031                 }
2032                 indent += 4;
2033                 break;
2034             case OP_CASE_END:
2035                 printIndent(*css);
2036                 *css << "break;\n";
2037                 indent -= 4;
2038                 printIndent(*css);
2039                 *css << "}\n";
2040                 break;
2041             default:
2042                 DE_ASSERT(0);
2043                 break;
2044             }
2045         }
2046     }
2047 
2048     // Simulate execution of the program. If countOnly is true, just return
2049     // the max number of outputs written. If it's false, store out the result
2050     // values to ref.
2051     virtual uint32_t simulate(bool countOnly, uint32_t subgroupSize, add_ref<std::vector<uint64_t>> ref) = 0;
2052 
execute(qpWatchDog * watchDog,bool countOnly,const uint32_t subgroupSize,const uint32_t fragmentStride,const uint32_t primitiveStride,add_ref<std::vector<tcu::UVec4>> ref,add_ref<tcu::TestLog> log,add_cref<std::vector<uint32_t>> outputP={},const tcu::UVec4 * cmp=nullptr,const uint32_t primitiveID=(~0u))2053     virtual uint32_t execute(qpWatchDog *watchDog, bool countOnly, const uint32_t subgroupSize,
2054                              const uint32_t fragmentStride, const uint32_t primitiveStride,
2055                              add_ref<std::vector<tcu::UVec4>> ref, add_ref<tcu::TestLog> log,
2056                              add_cref<std::vector<uint32_t>> outputP = {}, const tcu::UVec4 *cmp = nullptr,
2057                              const uint32_t primitiveID = (~0u))
2058     {
2059         // Per-invocation output location counters
2060         std::vector<uint32_t> outLoc;
2061         std::vector<SubgroupState2> stateStack;
2062         uint32_t subgroupCount;
2063         uint32_t logFailureCount;
2064         auto prerequisites = makePrerequisites(outputP, subgroupSize, fragmentStride, primitiveStride, stateStack,
2065                                                outLoc, subgroupCount);
2066         const Ballot fullSubgroupMask = subgroupSizeToMask(subgroupSize, subgroupCount);
2067 
2068         logFailureCount = 10u;
2069         nesting         = 0;
2070         loopNesting     = 0;
2071 
2072         int32_t i          = 0;
2073         uint32_t loopCount = 0;
2074 
2075         while (i < (int32_t)ops.size())
2076         {
2077             add_cref<Ballots> activeMask = stateStack[nesting].activeMask;
2078 
2079             if ((loopCount % 5000) == 0 && watchDog)
2080                 qpWatchDog_touch(watchDog);
2081 
2082             switch (ops[i].type)
2083             {
2084             case OP_BALLOT:
2085                 // Flag that this ballot is workgroup-nonuniform
2086                 if (caseDef.isWUCF() && activeMask.any() && !activeMask.all())
2087                     ops[i].caseValue = 1;
2088 
2089                 if (caseDef.isSUCF())
2090                 {
2091                     for (uint32_t id = 0; id < invocationStride; id += subgroupSize)
2092                     {
2093                         const Ballot subgroupMask = bitsetToBallot(activeMask, fullSubgroupMask, subgroupSize, id);
2094                         // Flag that this ballot is subgroup-nonuniform
2095                         if (subgroupMask != 0 && subgroupMask != fullSubgroupMask)
2096                             ops[i].caseValue = 1;
2097                     }
2098                 }
2099 
2100                 simulateBallot(countOnly, activeMask, primitiveID, i, outLoc, ref, log, prerequisites, logFailureCount,
2101                                (i > 0 ? ops[i - 1].type : OP_BALLOT), cmp);
2102                 break;
2103             case OP_STORE:
2104                 simulateStore(countOnly, stateStack[nesting].activeMask, primitiveID, ops[i].value, outLoc, ref, log,
2105                               prerequisites, logFailureCount, (i > 0 ? ops[i - 1].type : OP_STORE), cmp);
2106                 break;
2107             case OP_IF_MASK:
2108                 nesting++;
2109                 stateStack[nesting].activeMask =
2110                     stateStack[nesting - 1].activeMask & ballotsFromBallot(ops[i].bvalue, subgroupSize, subgroupCount);
2111                 stateStack[nesting].header   = i;
2112                 stateStack[nesting].isLoop   = 0;
2113                 stateStack[nesting].isSwitch = 0;
2114                 break;
2115             case OP_ELSE_MASK:
2116                 stateStack[nesting].activeMask =
2117                     stateStack[nesting - 1].activeMask &
2118                     ~ballotsFromBallot(ops[stateStack[nesting].header].bvalue, subgroupSize, subgroupCount);
2119                 break;
2120             case OP_IF_LOOPCOUNT:
2121             {
2122                 uint32_t n = nesting;
2123                 while (!stateStack[n].isLoop)
2124                     n--;
2125                 const Ballot tripBallot = Ballot::withSetBit(stateStack[n].tripCount);
2126 
2127                 nesting++;
2128                 stateStack[nesting].activeMask =
2129                     stateStack[nesting - 1].activeMask & ballotsFromBallot(tripBallot, subgroupSize, subgroupCount);
2130                 stateStack[nesting].header   = i;
2131                 stateStack[nesting].isLoop   = 0;
2132                 stateStack[nesting].isSwitch = 0;
2133                 break;
2134             }
2135             case OP_ELSE_LOOPCOUNT:
2136             {
2137                 uint32_t n = nesting;
2138                 while (!stateStack[n].isLoop)
2139                     n--;
2140                 const Ballot tripBallot = Ballot::withSetBit(stateStack[n].tripCount);
2141 
2142                 stateStack[nesting].activeMask =
2143                     stateStack[nesting - 1].activeMask & ~ballotsFromBallot(tripBallot, subgroupSize, subgroupCount);
2144                 break;
2145             }
2146             case OP_IF_LOCAL_INVOCATION_INDEX:
2147             {
2148                 // all bits >= N
2149                 Ballots mask(subgroupCount);
2150                 const uint32_t maxID = subgroupCount * subgroupSize;
2151                 for (uint32_t id = static_cast<uint32_t>(ops[i].value); id < maxID; ++id)
2152                 {
2153                     mask.set(Ballots::findBit(id, subgroupSize));
2154                 }
2155 
2156                 nesting++;
2157                 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask & mask;
2158                 stateStack[nesting].header     = i;
2159                 stateStack[nesting].isLoop     = 0;
2160                 stateStack[nesting].isSwitch   = 0;
2161                 break;
2162             }
2163             case OP_ELSE_LOCAL_INVOCATION_INDEX:
2164             {
2165                 // all bits < N
2166                 Ballots mask(subgroupCount);
2167                 const uint32_t maxID = subgroupCount * subgroupSize;
2168                 for (uint32_t id = 0u; id < static_cast<uint32_t>(ops[i].value) && id < maxID; ++id)
2169                 {
2170                     mask.set(Ballots::findBit(id, subgroupSize));
2171                 }
2172 
2173                 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask & mask;
2174                 break;
2175             }
2176             case OP_ENDIF:
2177                 nesting--;
2178                 break;
2179             case OP_BEGIN_FOR_UNIF:
2180                 // XXX TODO: We don't handle a for loop with zero iterations
2181                 nesting++;
2182                 loopNesting++;
2183                 stateStack[nesting].activeMask   = stateStack[nesting - 1].activeMask;
2184                 stateStack[nesting].header       = i;
2185                 stateStack[nesting].tripCount    = 0;
2186                 stateStack[nesting].isLoop       = 1;
2187                 stateStack[nesting].isSwitch     = 0;
2188                 stateStack[nesting].continueMask = 0;
2189                 break;
2190             case OP_END_FOR_UNIF:
2191                 stateStack[nesting].tripCount++;
2192                 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
2193                 stateStack[nesting].continueMask = 0;
2194                 if (stateStack[nesting].tripCount < ops[stateStack[nesting].header].value &&
2195                     stateStack[nesting].activeMask.any())
2196                 {
2197                     i = stateStack[nesting].header + 1;
2198                     continue;
2199                 }
2200                 else
2201                 {
2202                     loopNesting--;
2203                     nesting--;
2204                 }
2205                 break;
2206             case OP_BEGIN_DO_WHILE_UNIF:
2207                 // XXX TODO: We don't handle a for loop with zero iterations
2208                 nesting++;
2209                 loopNesting++;
2210                 stateStack[nesting].activeMask   = stateStack[nesting - 1].activeMask;
2211                 stateStack[nesting].header       = i;
2212                 stateStack[nesting].tripCount    = 1;
2213                 stateStack[nesting].isLoop       = 1;
2214                 stateStack[nesting].isSwitch     = 0;
2215                 stateStack[nesting].continueMask = 0;
2216                 break;
2217             case OP_END_DO_WHILE_UNIF:
2218                 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
2219                 stateStack[nesting].continueMask = 0;
2220                 if (stateStack[nesting].tripCount < ops[stateStack[nesting].header].value &&
2221                     stateStack[nesting].activeMask.any())
2222                 {
2223                     i = stateStack[nesting].header + 1;
2224                     stateStack[nesting].tripCount++;
2225                     continue;
2226                 }
2227                 else
2228                 {
2229                     loopNesting--;
2230                     nesting--;
2231                 }
2232                 break;
2233             case OP_BEGIN_FOR_VAR:
2234                 // XXX TODO: We don't handle a for loop with zero iterations
2235                 nesting++;
2236                 loopNesting++;
2237                 stateStack[nesting].activeMask   = stateStack[nesting - 1].activeMask;
2238                 stateStack[nesting].header       = i;
2239                 stateStack[nesting].tripCount    = 0;
2240                 stateStack[nesting].isLoop       = 1;
2241                 stateStack[nesting].isSwitch     = 0;
2242                 stateStack[nesting].continueMask = 0;
2243                 break;
2244             case OP_END_FOR_VAR:
2245             {
2246                 stateStack[nesting].tripCount++;
2247                 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
2248                 stateStack[nesting].continueMask = 0;
2249                 Ballot tripBallot;
2250                 if (subgroupSize != stateStack[nesting].tripCount)
2251                 {
2252                     for (uint32_t bit = stateStack[nesting].tripCount; bit < tripBallot.size(); ++bit)
2253                         tripBallot.set(bit);
2254                 }
2255                 stateStack[nesting].activeMask &= ballotsFromBallot(tripBallot, subgroupSize, subgroupCount);
2256 
2257                 if (stateStack[nesting].activeMask.any())
2258                 {
2259                     i = stateStack[nesting].header + 1;
2260                     continue;
2261                 }
2262                 else
2263                 {
2264                     loopNesting--;
2265                     nesting--;
2266                 }
2267                 break;
2268             }
2269             case OP_BEGIN_FOR_INF:
2270             case OP_BEGIN_DO_WHILE_INF:
2271                 nesting++;
2272                 loopNesting++;
2273                 stateStack[nesting].activeMask   = stateStack[nesting - 1].activeMask;
2274                 stateStack[nesting].header       = i;
2275                 stateStack[nesting].tripCount    = 0;
2276                 stateStack[nesting].isLoop       = 1;
2277                 stateStack[nesting].isSwitch     = 0;
2278                 stateStack[nesting].continueMask = 0;
2279                 break;
2280             case OP_END_FOR_INF:
2281                 stateStack[nesting].tripCount++;
2282                 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
2283                 stateStack[nesting].continueMask = 0;
2284                 if (stateStack[nesting].activeMask.any())
2285                 {
2286                     // output expected OP_BALLOT values
2287                     simulateBallot(countOnly, stateStack[nesting].activeMask, primitiveID, i, outLoc, ref, log,
2288                                    prerequisites, logFailureCount, (i > 0 ? ops[i - 1].type : OP_BALLOT), cmp);
2289 
2290                     i = stateStack[nesting].header + 1;
2291                     continue;
2292                 }
2293                 else
2294                 {
2295                     loopNesting--;
2296                     nesting--;
2297                 }
2298                 break;
2299             case OP_END_DO_WHILE_INF:
2300                 stateStack[nesting].tripCount++;
2301                 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
2302                 stateStack[nesting].continueMask = 0;
2303                 if (stateStack[nesting].activeMask.any())
2304                 {
2305                     i = stateStack[nesting].header + 1;
2306                     continue;
2307                 }
2308                 else
2309                 {
2310                     loopNesting--;
2311                     nesting--;
2312                 }
2313                 break;
2314             case OP_BREAK:
2315             {
2316                 uint32_t n         = nesting;
2317                 const Ballots mask = stateStack[nesting].activeMask;
2318                 while (true)
2319                 {
2320                     stateStack[n].activeMask &= ~mask;
2321                     if (stateStack[n].isLoop || stateStack[n].isSwitch)
2322                         break;
2323 
2324                     n--;
2325                 }
2326             }
2327             break;
2328             case OP_CONTINUE:
2329             {
2330                 uint32_t n         = nesting;
2331                 const Ballots mask = stateStack[nesting].activeMask;
2332                 while (true)
2333                 {
2334                     stateStack[n].activeMask &= ~mask;
2335                     if (stateStack[n].isLoop)
2336                     {
2337                         stateStack[n].continueMask |= mask;
2338                         break;
2339                     }
2340                     n--;
2341                 }
2342             }
2343             break;
2344             case OP_ELECT:
2345             {
2346                 nesting++;
2347                 stateStack[nesting].activeMask = bitsetElect(stateStack[nesting - 1].activeMask);
2348                 stateStack[nesting].header     = i;
2349                 stateStack[nesting].isLoop     = 0;
2350                 stateStack[nesting].isSwitch   = 0;
2351             }
2352             break;
2353             case OP_RETURN:
2354             {
2355                 const Ballots mask = stateStack[nesting].activeMask;
2356                 for (int32_t n = nesting; n >= 0; --n)
2357                 {
2358                     stateStack[n].activeMask &= ~mask;
2359                     if (stateStack[n].isCall)
2360                         break;
2361                 }
2362             }
2363             break;
2364 
2365             case OP_CALL_BEGIN:
2366                 nesting++;
2367                 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
2368                 stateStack[nesting].isLoop     = 0;
2369                 stateStack[nesting].isSwitch   = 0;
2370                 stateStack[nesting].isCall     = 1;
2371                 break;
2372             case OP_CALL_END:
2373                 stateStack[nesting].isCall = 0;
2374                 nesting--;
2375                 break;
2376             case OP_NOISE:
2377                 break;
2378 
2379             case OP_SWITCH_UNIF_BEGIN:
2380             case OP_SWITCH_VAR_BEGIN:
2381             case OP_SWITCH_LOOP_COUNT_BEGIN:
2382                 nesting++;
2383                 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
2384                 stateStack[nesting].header     = i;
2385                 stateStack[nesting].isLoop     = 0;
2386                 stateStack[nesting].isSwitch   = 1;
2387                 break;
2388             case OP_SWITCH_END:
2389                 nesting--;
2390                 break;
2391             case OP_CASE_MASK_BEGIN:
2392                 stateStack[nesting].activeMask =
2393                     stateStack[nesting - 1].activeMask & ballotsFromBallot(ops[i].bvalue, subgroupSize, subgroupCount);
2394                 break;
2395             case OP_CASE_LOOP_COUNT_BEGIN:
2396             {
2397                 uint32_t n = nesting;
2398                 uint32_t l = loopNesting;
2399 
2400                 while (true)
2401                 {
2402                     if (stateStack[n].isLoop)
2403                     {
2404                         l--;
2405                         if (l == ops[stateStack[nesting].header].value)
2406                             break;
2407                     }
2408                     n--;
2409                 }
2410 
2411                 if ((Ballot::withSetBit(stateStack[n].tripCount) & Ballot(ops[i].bvalue)).any())
2412                     stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
2413                 else
2414                     stateStack[nesting].activeMask = 0;
2415                 break;
2416             }
2417             case OP_CASE_END:
2418                 break;
2419 
2420             default:
2421                 DE_ASSERT(0);
2422                 break;
2423             }
2424             i++;
2425             loopCount++;
2426         }
2427         uint32_t maxLoc = 0;
2428         for (uint32_t id = 0; id < (uint32_t)outLoc.size(); ++id)
2429             maxLoc = de::max(maxLoc, outLoc[id]);
2430 
2431         return maxLoc;
2432     }
2433 
hasUCF() const2434     bool hasUCF() const
2435     {
2436         for (int32_t i = 0; i < (int32_t)ops.size(); ++i)
2437         {
2438             if (ops[i].type == OP_BALLOT && ops[i].caseValue == 0)
2439                 return true;
2440         }
2441         return false;
2442     }
2443 
2444 protected:
makePrerequisites(add_cref<std::vector<uint32_t>> outputP,const uint32_t subgroupSize,const uint32_t fragmentStride,const uint32_t primitiveStride,add_ref<std::vector<SubgroupState2>> stateStack,add_ref<std::vector<uint32_t>> outLoc,add_ref<uint32_t> subgroupCount)2445     virtual std::shared_ptr<Prerequisites> makePrerequisites(add_cref<std::vector<uint32_t>> outputP,
2446                                                              const uint32_t subgroupSize, const uint32_t fragmentStride,
2447                                                              const uint32_t primitiveStride,
2448                                                              add_ref<std::vector<SubgroupState2>> stateStack,
2449                                                              add_ref<std::vector<uint32_t>> outLoc,
2450                                                              add_ref<uint32_t> subgroupCount)
2451     {
2452         DE_UNREF(outputP);
2453         DE_UNREF(subgroupSize);
2454         DE_UNREF(fragmentStride);
2455         DE_UNREF(primitiveStride);
2456         DE_UNREF(stateStack);
2457         DE_UNREF(outLoc);
2458         DE_UNREF(subgroupCount);
2459         return std::make_shared<Prerequisites>();
2460     }
2461 
simulateBallot(const bool countOnly,add_cref<Ballots> activeMask,const uint32_t primitiveID,const int32_t opsIndex,add_ref<std::vector<uint32_t>> outLoc,add_ref<std::vector<tcu::UVec4>> ref,add_ref<tcu::TestLog> log,std::shared_ptr<Prerequisites> prerequisites,add_ref<uint32_t> logFailureCount,const OPType reason,const tcu::UVec4 * cmp)2462     virtual void simulateBallot(const bool countOnly, add_cref<Ballots> activeMask, const uint32_t primitiveID,
2463                                 const int32_t opsIndex, add_ref<std::vector<uint32_t>> outLoc,
2464                                 add_ref<std::vector<tcu::UVec4>> ref, add_ref<tcu::TestLog> log,
2465                                 std::shared_ptr<Prerequisites> prerequisites, add_ref<uint32_t> logFailureCount,
2466                                 const OPType reason, const tcu::UVec4 *cmp)
2467     {
2468         DE_UNREF(countOnly);
2469         DE_UNREF(activeMask);
2470         DE_UNREF(primitiveID);
2471         DE_UNREF(opsIndex);
2472         DE_UNREF(outLoc);
2473         DE_UNREF(ref);
2474         DE_UNREF(log);
2475         DE_UNREF(prerequisites);
2476         DE_UNREF(logFailureCount);
2477         DE_UNREF(reason);
2478         DE_UNREF(cmp);
2479     }
2480 
simulateStore(const bool countOnly,add_cref<Ballots> activeMask,const uint32_t primitiveID,const uint64_t storeValue,add_ref<std::vector<uint32_t>> outLoc,add_ref<std::vector<tcu::UVec4>> ref,add_ref<tcu::TestLog> log,std::shared_ptr<Prerequisites> prerequisites,add_ref<uint32_t> logFailureCount,const OPType reason,const tcu::UVec4 * cmp)2481     virtual void simulateStore(const bool countOnly, add_cref<Ballots> activeMask, const uint32_t primitiveID,
2482                                const uint64_t storeValue, add_ref<std::vector<uint32_t>> outLoc,
2483                                add_ref<std::vector<tcu::UVec4>> ref, add_ref<tcu::TestLog> log,
2484                                std::shared_ptr<Prerequisites> prerequisites, add_ref<uint32_t> logFailureCount,
2485                                const OPType reason, const tcu::UVec4 *cmp)
2486     {
2487         DE_UNREF(countOnly);
2488         DE_UNREF(activeMask);
2489         DE_UNREF(primitiveID);
2490         DE_UNREF(storeValue);
2491         DE_UNREF(outLoc);
2492         DE_UNREF(ref);
2493         DE_UNREF(log);
2494         DE_UNREF(prerequisites);
2495         DE_UNREF(logFailureCount);
2496         DE_UNREF(reason);
2497         DE_UNREF(cmp);
2498     }
2499 };
2500 
2501 class ComputeRandomProgram : public RandomProgram
2502 {
2503 public:
ComputeRandomProgram(const CaseDef & c)2504     ComputeRandomProgram(const CaseDef &c) : RandomProgram(c, uint32_t(c.sizeX * c.sizeY))
2505     {
2506         DE_ASSERT(c.shaderStage == VK_SHADER_STAGE_COMPUTE_BIT);
2507     }
2508     virtual ~ComputeRandomProgram() = default;
2509 
simulate(bool countOnly,uint32_t subgroupSize,add_ref<std::vector<uint64_t>> ref)2510     virtual uint32_t simulate(bool countOnly, uint32_t subgroupSize, add_ref<std::vector<uint64_t>> ref) override
2511     {
2512         DE_ASSERT(false);
2513         // Do not use this method, to simulate generated program use simulate2 instead
2514         DE_UNREF(countOnly);
2515         DE_UNREF(subgroupSize);
2516         DE_UNREF(ref);
2517         return 0;
2518     }
2519 
2520     struct ComputePrerequisites : Prerequisites
2521     {
2522         const uint32_t subgroupSize;
2523         const uint32_t subgroupCount;
2524         const Ballot subgroupSizeMask;
2525         std::vector<std::pair<bool, tcu::UVec4>> ballots;
ComputePrerequisitesvkt::Reconvergence::__anone030def80111::ComputeRandomProgram::ComputePrerequisites2526         ComputePrerequisites(uint32_t subgroupSize_, uint32_t subgroupCount_)
2527             : subgroupSize(subgroupSize_)
2528             , subgroupCount(subgroupCount_)
2529             , subgroupSizeMask(subgroupSizeToMask(subgroupSize, subgroupCount))
2530             , ballots(subgroupCount_)
2531         {
2532         }
2533     };
2534 
printBallot(add_ref<std::stringstream> css,add_cref<FlowState>,bool endWithSemicolon=false)2535     virtual void printBallot(add_ref<std::stringstream> css, add_cref<FlowState>,
2536                              bool endWithSemicolon = false) override
2537     {
2538         printIndent(css);
2539 
2540         css << "outputC.loc[gl_LocalInvocationIndex]++,";
2541         // When inside loop(s), use partitionBallot rather than subgroupBallot to compute
2542         // a ballot, to make sure the ballot is "diverged enough". Don't do this for
2543         // subgroup_uniform_control_flow, since we only validate results that must be fully
2544         // reconverged.
2545         if (loopNesting > 0 && caseDef.testType == TT_MAXIMAL)
2546         {
2547             css << "outputB.b[(outLoc++)*invocationStride + gl_LocalInvocationIndex] = " << getPartitionBallotText();
2548         }
2549         else if (caseDef.isElect())
2550         {
2551             css << "outputB.b[(outLoc++)*invocationStride + gl_LocalInvocationIndex].x = elect()";
2552         }
2553         else
2554         {
2555             css << "outputB.b[(outLoc++)*invocationStride + gl_LocalInvocationIndex] = subgroupBallot(true)";
2556         }
2557         if (endWithSemicolon)
2558         {
2559             css << ";\n";
2560         }
2561     }
2562 
2563 protected:
simulateBallot(const bool countOnly,add_cref<Ballots> activeMask,const uint32_t unusedPrimitiveID,const int32_t opsIndex,add_ref<std::vector<uint32_t>> outLoc,add_ref<std::vector<tcu::UVec4>> ref,add_ref<tcu::TestLog> log,std::shared_ptr<Prerequisites> prerequisites,add_ref<uint32_t> logFailureCount,const OPType reason,const tcu::UVec4 * cmp)2564     virtual void simulateBallot(const bool countOnly, add_cref<Ballots> activeMask, const uint32_t unusedPrimitiveID,
2565                                 const int32_t opsIndex, add_ref<std::vector<uint32_t>> outLoc,
2566                                 add_ref<std::vector<tcu::UVec4>> ref, add_ref<tcu::TestLog> log,
2567                                 std::shared_ptr<Prerequisites> prerequisites, add_ref<uint32_t> logFailureCount,
2568                                 const OPType reason, const tcu::UVec4 *cmp) override
2569     {
2570         DE_UNREF(unusedPrimitiveID);
2571         DE_UNREF(log);
2572         DE_UNREF(logFailureCount);
2573         DE_UNREF(reason);
2574         DE_UNREF(cmp);
2575         auto pre                     = static_pointer_cast<ComputePrerequisites>(prerequisites);
2576         const uint32_t subgroupCount = activeMask.subgroupCount();
2577         const uint32_t subgroupSize  = pre->subgroupSize;
2578 
2579         std::fill_n(pre->ballots.begin(), subgroupCount, std::pair<bool, tcu::UVec4>());
2580 
2581         for (uint32_t id = 0; id < invocationStride; ++id)
2582         {
2583             if (activeMask.test((Ballots::findBit(id, subgroupSize))))
2584             {
2585                 if (countOnly)
2586                 {
2587                     outLoc[id]++;
2588                 }
2589                 else
2590                 {
2591                     if (ops[opsIndex].caseValue)
2592                     {
2593                         // Emit a magic value to indicate that we shouldn't validate this ballot
2594                         ref[(outLoc[id]++) * invocationStride + id] =
2595                             bitsetToBallot(0x12345678, subgroupCount, subgroupSize, id);
2596                     }
2597                     else
2598                     {
2599                         add_ref<std::pair<bool, tcu::UVec4>> info(pre->ballots.at(id / subgroupSize));
2600                         if (false == info.first)
2601                         {
2602                             info.first  = true;
2603                             info.second = bitsetToBallot(activeMask, pre->subgroupSizeMask, subgroupSize, id);
2604                         }
2605                         ref[(outLoc[id]++) * invocationStride + id] = info.second;
2606                     }
2607                 }
2608             }
2609         }
2610     }
2611 
simulateStore(const bool countOnly,add_cref<Ballots> activeMask,const uint32_t unusedPrimitiveID,const uint64_t storeValue,add_ref<std::vector<uint32_t>> outLoc,add_ref<std::vector<tcu::UVec4>> ref,add_ref<tcu::TestLog> log,std::shared_ptr<Prerequisites> prerequisites,add_ref<uint32_t> logFailureCount,const OPType reason,const tcu::UVec4 * cmp)2612     virtual void simulateStore(const bool countOnly, add_cref<Ballots> activeMask, const uint32_t unusedPrimitiveID,
2613                                const uint64_t storeValue, add_ref<std::vector<uint32_t>> outLoc,
2614                                add_ref<std::vector<tcu::UVec4>> ref, add_ref<tcu::TestLog> log,
2615                                std::shared_ptr<Prerequisites> prerequisites, add_ref<uint32_t> logFailureCount,
2616                                const OPType reason, const tcu::UVec4 *cmp) override
2617     {
2618         DE_UNREF(unusedPrimitiveID);
2619         DE_UNREF(log);
2620         DE_UNREF(logFailureCount);
2621         DE_UNREF(reason);
2622         DE_UNREF(cmp);
2623         const uint32_t subgroupSize = static_pointer_cast<ComputePrerequisites>(prerequisites)->subgroupSize;
2624         for (uint32_t id = 0; id < invocationStride; ++id)
2625         {
2626             if (activeMask.test(Ballots::findBit(id, subgroupSize)))
2627             {
2628                 if (countOnly)
2629                     outLoc[id]++;
2630                 else
2631                     ref[(outLoc[id]++) * invocationStride + id][0] = uint32_t(storeValue & 0xFFFFFFFF);
2632             }
2633         }
2634     }
2635 
makePrerequisites(add_cref<std::vector<uint32_t>> outputP,const uint32_t subgroupSize,const uint32_t fragmentStride,const uint32_t primitiveStride,add_ref<std::vector<SubgroupState2>> stateStack,add_ref<std::vector<uint32_t>> outLoc,add_ref<uint32_t> subgroupCount)2636     virtual std::shared_ptr<Prerequisites> makePrerequisites(add_cref<std::vector<uint32_t>> outputP,
2637                                                              const uint32_t subgroupSize, const uint32_t fragmentStride,
2638                                                              const uint32_t primitiveStride,
2639                                                              add_ref<std::vector<SubgroupState2>> stateStack,
2640                                                              add_ref<std::vector<uint32_t>> outLoc,
2641                                                              add_ref<uint32_t> subgroupCount) override
2642     {
2643         DE_UNREF(outputP);
2644         DE_UNREF(fragmentStride);
2645         DE_ASSERT(invocationStride == primitiveStride);
2646         subgroupCount      = ROUNDUP(invocationStride, subgroupSize) / subgroupSize;
2647         auto prerequisites = std::make_shared<ComputePrerequisites>(subgroupSize, subgroupCount);
2648         stateStack.resize(10u, SubgroupState2(subgroupCount));
2649         outLoc.resize(primitiveStride, 0u);
2650         add_ref<Ballots> activeMask(stateStack.at(0).activeMask);
2651         for (uint32_t id = 0; id < invocationStride; ++id)
2652         {
2653             activeMask.set(Ballots::findBit(id, subgroupSize));
2654         }
2655         return prerequisites;
2656     }
2657 };
2658 
2659 class FragmentRandomProgram : public RandomProgram
2660 {
2661 public:
#define BALLOT_STACK_SIZE_DEFVAL_LINE (__LINE__ + 1)
    static constexpr const uint32_t experimentalOutLocSize      = 16384;
    // NOTE(review): keep the #define directly above experimentalOutLocSize; "+ 1" presumably targets
    // that declaration. __LINE__ is substituted where the macro is expanded, not where it is defined,
    // so the value is "line of the use site + 1" - confirm that this is what the users expect.
    // conditionIfInvocationStride is handed to the RandomProgram constructor and to
    // RandomProgram::genIf() by this class.
    static constexpr const uint32_t conditionIfInvocationStride = 511u;
    // Constructs the program, passing conditionIfInvocationStride as the second argument of the
    // RandomProgram constructor.
    FragmentRandomProgram(const CaseDef &c) : RandomProgram(c, conditionIfInvocationStride)
    {
        // Debug-only sanity checks: the test type must be TT_MAXIMAL and the
        // shader stage of the case must be the fragment stage.
        DE_ASSERT(caseDef.testType == TT_MAXIMAL);
        DE_ASSERT(c.shaderStage == VK_SHADER_STAGE_FRAGMENT_BIT);
    }
2670     virtual ~FragmentRandomProgram() = default;
2671 
create(const CaseDef & c)2672     static de::MovePtr<FragmentRandomProgram> create(const CaseDef &c)
2673     {
2674         return de::MovePtr<FragmentRandomProgram>(new FragmentRandomProgram(c));
2675     }
2676 
printIfLocalInvocationIndex(add_ref<std::stringstream> css,add_cref<FlowState> flow)2677     virtual void printIfLocalInvocationIndex(add_ref<std::stringstream> css, add_cref<FlowState> flow) override
2678     {
2679         printIndent(css);
2680         css << "if (invocationIndex() >= inputA.a[0x" << std::hex << flow.ops[flow.opsIndex].value << "]) {\n";
2681     }
2682 
printStore(add_ref<std::stringstream> css,add_cref<FlowState> flow)2683     virtual void printStore(add_ref<std::stringstream> css, add_cref<FlowState> flow) override
2684     {
2685         printIndent(css);
2686         css << "storeValue(outLoc++, 0x" << std::hex << flow.ops[flow.opsIndex].value << ");\n";
2687     }
2688 
printBallot(add_ref<std::stringstream> css,add_cref<FlowState>,bool endWidthSemicolon=false)2689     virtual void printBallot(add_ref<std::stringstream> css, add_cref<FlowState>,
2690                              bool endWidthSemicolon = false) override
2691     {
2692         printIndent(css);
2693         // When inside loop(s), use partitionBallot rather than subgroupBallot to compute
2694         // a ballot, to make sure the ballot is "diverged enough". Don't do this for
2695         // subgroup_uniform_control_flow, since we only validate results that must be fully
2696         // reconverged.
2697         if (loopNesting > 0)
2698         {
2699             css << "storeBallot(outLoc++)";
2700         }
2701         else
2702         {
2703             css << getPartitionBallotText();
2704         }
2705         if (endWidthSemicolon)
2706         {
2707             css << ";\n";
2708         }
2709     }
2710 
getPartitionBallotText()2711     virtual std::string getPartitionBallotText() override
2712     {
2713         return "storeBallot(outLoc++)";
2714     }
2715 
genIf(IFType ifType,uint32_t maxLocalIndexCmp=0u)2716     virtual void genIf(IFType ifType, uint32_t maxLocalIndexCmp = 0u) override
2717     {
2718         DE_UNREF(maxLocalIndexCmp);
2719         RandomProgram::genIf(ifType, conditionIfInvocationStride);
2720     }
2721 
2722     struct Arrangement : Prerequisites, ReconvergenceTestFragmentInstance::Arrangement
2723     {
2724         const uint32_t m_width;
2725         const uint32_t m_height;
2726         const uint32_t m_subgroupSize;
2727         const uint32_t m_fragmentStride;
2728         const uint32_t m_primitiveStride;
2729         const uint32_t m_subgroupCount;
2730         const Ballots m_initialBallots;
2731         const Ballots m_nonHelperInitialBallots;
2732         const uint32_t m_invocationStride;
2733         const std::vector<std::vector<uint32_t>> m_fragmentSubgroups;
Arrangementvkt::Reconvergence::__anone030def80111::FragmentRandomProgram::Arrangement2734         Arrangement(add_cref<std::vector<uint32_t>> info, uint32_t width, uint32_t height, uint32_t subgroupSize,
2735                     uint32_t primitiveStride)
2736             : m_width(width)
2737             , m_height(height)
2738             , m_subgroupSize(subgroupSize)
2739             , m_fragmentStride(width * height)
2740             , m_primitiveStride(primitiveStride)
2741             , m_subgroupCount(calcSubgroupCount(info, primitiveStride, m_fragmentStride))
2742             , m_initialBallots(makeInitialBallots(info, primitiveStride, m_fragmentStride, false))
2743             , m_nonHelperInitialBallots(makeInitialBallots(info, primitiveStride, m_fragmentStride, true))
2744             , m_invocationStride(calcInvocationStride(info, subgroupSize, primitiveStride, m_fragmentStride))
2745             , m_fragmentSubgroups(makeFragmentSubgroups(info, subgroupSize, primitiveStride, m_fragmentStride))
2746         {
2747         }
calcSubgroupCountvkt::Reconvergence::__anone030def80111::FragmentRandomProgram::Arrangement2748         static uint32_t calcSubgroupCount(add_cref<std::vector<uint32_t>> info, const uint32_t primitiveStride,
2749                                           const uint32_t fragmentStride)
2750         {
2751             const uint32_t cc = fragmentStride * primitiveStride;
2752             std::set<uint32_t> s;
2753             uint32_t subgroupID;
2754             uint32_t subgroupInvocationID;
2755             uint32_t isHelperInvocation;
2756             for (uint32_t c = 0u; c < cc; ++c)
2757             {
2758                 if (validID(info.at(c), subgroupID, subgroupInvocationID, isHelperInvocation))
2759                     s.insert(subgroupID);
2760             }
2761             const uint32_t gMin = *s.begin();
2762             DE_UNREF(gMin);
2763             const uint32_t gMax = *std::next(s.begin(), (s.size() - 1u));
2764             DE_UNREF(gMax);
2765             DE_ASSERT(gMin == 0u);
2766             DE_ASSERT(gMax == (s.size() - 1u));
2767             return static_cast<uint32_t>(s.size());
2768         }
calcInvocationStridevkt::Reconvergence::__anone030def80111::FragmentRandomProgram::Arrangement2769         static uint32_t calcInvocationStride(add_cref<std::vector<uint32_t>> info, const uint32_t subgroupSize,
2770                                              const uint32_t primitiveStride, const uint32_t fragmentStride)
2771         {
2772             return calcSubgroupCount(info, fragmentStride, primitiveStride) * subgroupSize;
2773         }
makeInitialBallotsvkt::Reconvergence::__anone030def80111::FragmentRandomProgram::Arrangement2774         static Ballots makeInitialBallots(add_cref<std::vector<uint32_t>> info, const uint32_t primitiveStride,
2775                                           const uint32_t fragmentStride, bool excludeHelpers)
2776         {
2777             uint32_t subgroupID;
2778             uint32_t subgroupInvocationID;
2779             uint32_t isHelperInvocation;
2780             Ballots b(calcSubgroupCount(info, fragmentStride, primitiveStride));
2781             const uint32_t cc = fragmentStride * primitiveStride;
2782             for (uint32_t c = 0u; c < cc; ++c)
2783             {
2784                 if (validID(info.at(c), subgroupID, subgroupInvocationID, isHelperInvocation))
2785                 {
2786                     if (!(excludeHelpers && (isHelperInvocation != 0)))
2787                         b.at(subgroupID).set(subgroupInvocationID);
2788                 }
2789             }
2790             return b;
2791         }
2792         // Fully Qualified Invocation Name
fqinvkt::Reconvergence::__anone030def80111::FragmentRandomProgram::Arrangement2793         static uint32_t fqin(uint32_t maybeHelperFragmentFQIN, add_ref<uint32_t> isHelperInvocation)
2794         {
2795             isHelperInvocation = maybeHelperFragmentFQIN >> 31;
2796             return (maybeHelperFragmentFQIN & 0x7FFFFFFF);
2797         }
makeFragmentSubgroupsvkt::Reconvergence::__anone030def80111::FragmentRandomProgram::Arrangement2798         static auto makeFragmentSubgroups(add_cref<std::vector<uint32_t>> info, const uint32_t subgroupSize,
2799                                           const uint32_t primitiveStride, const uint32_t fragmentStride)
2800             -> std::vector<std::vector<uint32_t>>
2801         {
2802             const uint32_t subgroupCount = calcSubgroupCount(info, fragmentStride, primitiveStride);
2803             std::vector<std::vector<uint32_t>> map(primitiveStride);
2804             for (uint32_t p = 0u; p < primitiveStride; ++p)
2805                 map[p].resize(fragmentStride, (subgroupCount * subgroupSize));
2806 
2807             uint32_t subgroupID;
2808             uint32_t subgroupInvocationID;
2809             uint32_t isHelperInvocation;
2810             for (uint32_t p = 0u; p < primitiveStride; ++p)
2811                 for (uint32_t f = 0u; f < fragmentStride; ++f)
2812                 {
2813                     const uint32_t sgid = info.at(f * primitiveStride + p);
2814                     if (validID(sgid, subgroupID, subgroupInvocationID, isHelperInvocation))
2815                         map.at(p).at(f) =
2816                             (subgroupID * subgroupSize + subgroupInvocationID) | (isHelperInvocation << 31);
2817                 }
2818             return map;
2819         }
calcRealInvocationCountvkt::Reconvergence::__anone030def80111::FragmentRandomProgram::Arrangement2820         static uint32_t calcRealInvocationCount(add_cref<std::vector<uint32_t>> info, uint32_t primitiveStride,
2821                                                 uint32_t fragmentStride)
2822         {
2823             const uint32_t cc = fragmentStride * primitiveStride;
2824             uint32_t n        = 0u;
2825             for (uint32_t c = 0u; c < cc; ++c)
2826             {
2827                 if (info[c])
2828                     ++n;
2829             }
2830             return n;
2831         }
2832 
2833     private:
validIDvkt::Reconvergence::__anone030def80111::FragmentRandomProgram::Arrangement2834         static bool validID(const uint32_t id)
2835         {
2836             uint32_t subgroupID;
2837             DE_UNREF(subgroupID);
2838             uint32_t subgroupInvocationID;
2839             DE_UNREF(subgroupInvocationID);
2840             uint32_t isHelperInvocation;
2841             DE_UNREF(isHelperInvocation);
2842             return validID(id, subgroupID, subgroupInvocationID, isHelperInvocation);
2843         }
validIDvkt::Reconvergence::__anone030def80111::FragmentRandomProgram::Arrangement2844         static bool validID(const uint32_t id, add_ref<uint32_t> subgroupID, add_ref<uint32_t> subgroupInvocationID,
2845                             add_ref<uint32_t> isHelperInvocation)
2846         {
2847             if (id != 0u)
2848             {
2849                 subgroupInvocationID = (id & 0xFFFF);
2850                 subgroupID           = ((id >> 16) & 0x7FFF) - 1u;
2851                 isHelperInvocation   = (id >> 31);
2852                 return true;
2853             }
2854             return false;
2855         }
2856     };
2857 
simulate(bool countOnly,uint32_t subgroupSize,add_ref<std::vector<uint64_t>> ref)2858     virtual uint32_t simulate(bool countOnly, uint32_t subgroupSize, add_ref<std::vector<uint64_t>> ref) override
2859     {
2860         DE_ASSERT(false); // use overloaded version of simulate() instead
2861         DE_UNREF(countOnly);
2862         DE_UNREF(subgroupSize);
2863         DE_UNREF(ref);
2864         return 0;
2865     }
2866 
2867     // Simulate execution of the program. If countOnly is true, just return
2868     // the max number of outputs written. If it's false, store out the result
2869     // values to ref.
execute(qpWatchDog * watchDog,bool countOnly,const uint32_t subgroupSize,const uint32_t fragmentStride,const uint32_t primitiveStride,add_ref<std::vector<tcu::UVec4>> ref,add_ref<tcu::TestLog> log,add_cref<std::vector<uint32_t>> outputP,const tcu::UVec4 * cmp=nullptr,const uint32_t reserved=(~0u))2870     virtual uint32_t execute(qpWatchDog *watchDog, bool countOnly, const uint32_t subgroupSize,
2871                              const uint32_t fragmentStride, const uint32_t primitiveStride,
2872                              add_ref<std::vector<tcu::UVec4>> ref, add_ref<tcu::TestLog> log,
2873                              add_cref<std::vector<uint32_t>> outputP, const tcu::UVec4 *cmp = nullptr,
2874                              const uint32_t reserved = (~0u)) override
2875     {
2876         DE_UNREF(reserved);
2877         uint32_t outLocs    = 0u;
2878         uint32_t maxOutLocs = 0u;
2879         for (uint32_t primitiveID = 0u; primitiveID < primitiveStride; ++primitiveID)
2880         {
2881             outLocs    = RandomProgram::execute(watchDog, countOnly, subgroupSize, fragmentStride, primitiveStride, ref,
2882                                                 log, outputP, cmp, primitiveID);
2883             maxOutLocs = std::max(outLocs, maxOutLocs);
2884         }
2885         return maxOutLocs;
2886     }
2887 
2888 protected:
simulateStore(const bool countOnly,add_cref<Ballots> activeMask,const uint32_t primitiveID,const uint64_t storeValue,add_ref<std::vector<uint32_t>> outLoc,add_ref<std::vector<tcu::UVec4>> ref,add_ref<tcu::TestLog> log,std::shared_ptr<Prerequisites> prerequisites,add_ref<uint32_t> logFailureCount,const OPType reason,const tcu::UVec4 * cmp)2889     virtual void simulateStore(const bool countOnly, add_cref<Ballots> activeMask, const uint32_t primitiveID,
2890                                const uint64_t storeValue, add_ref<std::vector<uint32_t>> outLoc,
2891                                add_ref<std::vector<tcu::UVec4>> ref, add_ref<tcu::TestLog> log,
2892                                std::shared_ptr<Prerequisites> prerequisites, add_ref<uint32_t> logFailureCount,
2893                                const OPType reason, const tcu::UVec4 *cmp) override
2894     {
2895         uint32_t isHelperInvocation;
2896         add_cref<Arrangement> a(*std::static_pointer_cast<Arrangement>(prerequisites));
2897         for (const uint32_t id : a.m_fragmentSubgroups.at(primitiveID))
2898         {
2899             const uint32_t sgid = a.fqin(id, isHelperInvocation);
2900             if (sgid >= (a.m_subgroupCount * a.m_subgroupSize))
2901                 continue;
2902             if (false == activeMask.test(Ballots::findBit(sgid, a.m_subgroupSize)))
2903                 continue;
2904             const uint32_t loc   = primitiveID * a.m_subgroupCount * 128 + sgid;
2905             const uint32_t index = ((outLoc.at(loc)++) * (a.m_primitiveStride * a.m_subgroupCount * 128) +
2906                                     (primitiveID * a.m_subgroupCount * 128) + sgid);
2907             if (false == countOnly)
2908             {
2909                 ref.at(index) = tcu::UVec4(uint32_t(storeValue & 0xFFFFFFFF), 0u, 0u, 0u);
2910                 if (cmp && logFailureCount > 0u && cmp[index] != ref.at(index))
2911                 {
2912                     logFailureCount -= 1u;
2913                     log << tcu::TestLog::Message << logFailureCount << ": stored value mismatch from "
2914                         << OPtypeToStr(reason) << tcu::TestLog::EndMessage;
2915                 }
2916             }
2917         }
2918     }
2919 
simulateBallot(const bool countOnly,add_cref<Ballots> activeMask,const uint32_t primitiveID,const int32_t opsIndex,add_ref<std::vector<uint32_t>> outLoc,add_ref<std::vector<tcu::UVec4>> ref,add_ref<tcu::TestLog> log,std::shared_ptr<Prerequisites> prerequisites,add_ref<uint32_t> logFailureCount,const OPType reason,const tcu::UVec4 * cmp)2920     virtual void simulateBallot(const bool countOnly, add_cref<Ballots> activeMask, const uint32_t primitiveID,
2921                                 const int32_t opsIndex, add_ref<std::vector<uint32_t>> outLoc,
2922                                 add_ref<std::vector<tcu::UVec4>> ref, add_ref<tcu::TestLog> log,
2923                                 std::shared_ptr<Prerequisites> prerequisites, add_ref<uint32_t> logFailureCount,
2924                                 const OPType reason, const tcu::UVec4 *cmp) override
2925     {
2926         DE_UNREF(opsIndex);
2927         uint32_t isHelperInvocation;
2928         add_cref<Arrangement> a(*std::static_pointer_cast<Arrangement>(prerequisites));
2929         for (const uint32_t id : a.m_fragmentSubgroups.at(primitiveID))
2930         {
2931             const uint32_t sgid = a.fqin(id, isHelperInvocation);
2932             if (sgid >= (a.m_subgroupCount * a.m_subgroupSize))
2933                 continue;
2934             if (false == activeMask.test(Ballots::findBit(sgid, a.m_subgroupSize)))
2935                 continue;
2936             const uint32_t loc   = primitiveID * a.m_subgroupCount * 128 + sgid;
2937             const uint32_t index = ((outLoc.at(loc)++) * (a.m_primitiveStride * a.m_subgroupCount * 128) +
2938                                     (primitiveID * a.m_subgroupCount * 128) + sgid);
2939             if (false == countOnly)
2940             {
2941                 ref.at(index) = Ballot(activeMask.at(sgid / a.m_subgroupSize));
2942                 if (cmp && logFailureCount > 0u && cmp[index] != ref.at(index))
2943                 {
2944                     logFailureCount -= 1u;
2945                     log << tcu::TestLog::Message << logFailureCount << ": ballot mismatch from " << OPtypeToStr(reason)
2946                         << tcu::TestLog::EndMessage;
2947                 }
2948             }
2949         }
2950     }
2951 
makePrerequisites(add_cref<std::vector<uint32_t>> outputP,const uint32_t subgroupSize,const uint32_t fragmentStride,const uint32_t primitiveStride,add_ref<std::vector<SubgroupState2>> stateStack,add_ref<std::vector<uint32_t>> outLoc,add_ref<uint32_t> subgroupCount)2952     virtual std::shared_ptr<Prerequisites> makePrerequisites(add_cref<std::vector<uint32_t>> outputP,
2953                                                              const uint32_t subgroupSize, const uint32_t fragmentStride,
2954                                                              const uint32_t primitiveStride,
2955                                                              add_ref<std::vector<SubgroupState2>> stateStack,
2956                                                              add_ref<std::vector<uint32_t>> outLoc,
2957                                                              add_ref<uint32_t> subgroupCount) override
2958     {
2959         auto prerequisites = std::make_shared<Arrangement>(outputP, fragmentStride, 1u, subgroupSize, primitiveStride);
2960         subgroupCount      = prerequisites->m_subgroupCount;
2961         stateStack.resize(10u, SubgroupState2(subgroupCount));
2962         outLoc.resize((subgroupCount * 128u * fragmentStride), 0u);
2963         stateStack.at(0).activeMask = prerequisites->m_initialBallots;
2964         return prerequisites;
2965     }
2966 };
2967 
2968 class VertexRandomProgram : public RandomProgram
2969 {
2970 public:
2971     static const constexpr uint32_t fillPercentage = 73u;
VertexRandomProgram(add_cref<CaseDef> c)2972     VertexRandomProgram(add_cref<CaseDef> c)
2973         : RandomProgram(c,
2974                         static_cast<uint32_t>(Arrangement::generatePrimitives(c.sizeX, c.sizeY, fillPercentage).size()))
2975     {
2976         DE_ASSERT(c.shaderStage == VK_SHADER_STAGE_VERTEX_BIT);
2977     }
2978     virtual ~VertexRandomProgram() = default;
2979 
2980     struct Arrangement : Prerequisites
2981     {
2982         static constexpr uint32_t NUM_SUBGROUPS_OFFSET      = 0u;
2983         static constexpr uint32_t SUBGROUP_SIZE_OFFSET      = 1u;
2984         static constexpr uint32_t INVOCATION_COUNT_OFFSET   = 2u;
2985         static constexpr uint32_t INVOCATION_ENTRIES_OFFSET = 3u;
2986 
2987         const uint32_t m_subgroupSize;
2988         const uint32_t m_primitiveStride;
2989         const uint32_t m_subgroupCount;
2990         const Ballots m_initialBallots;
2991         const uint32_t m_invocationStride;
2992         const std::vector<uint32_t> m_primitiveSubgroups;
Arrangementvkt::Reconvergence::__anone030def80111::VertexRandomProgram::Arrangement2993         Arrangement(add_cref<std::vector<uint32_t>> outputP, uint32_t subgroupSize, uint32_t primitiveStride)
2994             : m_subgroupSize(subgroupSize)
2995             , m_primitiveStride(primitiveStride)
2996             , m_subgroupCount(calcSubgroupCount(outputP))
2997             , m_initialBallots(makeInitialBallots(subgroupSize, primitiveStride, outputP))
2998             , m_invocationStride(primitiveStride)
2999             , m_primitiveSubgroups(makePrimitiveSubgroups(subgroupSize, primitiveStride, outputP))
3000         {
3001         }
calcSubgroupCountvkt::Reconvergence::__anone030def80111::VertexRandomProgram::Arrangement3002         static uint32_t calcSubgroupCount(add_cref<std::vector<uint32_t>> outputP)
3003         {
3004             return outputP.at(NUM_SUBGROUPS_OFFSET);
3005         }
calcSubgroupSizevkt::Reconvergence::__anone030def80111::VertexRandomProgram::Arrangement3006         static uint32_t calcSubgroupSize(add_cref<std::vector<uint32_t>> outputP)
3007         {
3008             return outputP.at(SUBGROUP_SIZE_OFFSET);
3009         }
calcSubgroupInvocationStridevkt::Reconvergence::__anone030def80111::VertexRandomProgram::Arrangement3010         static uint32_t calcSubgroupInvocationStride(add_cref<std::vector<uint32_t>> outputP)
3011         {
3012             return outputP.at(INVOCATION_COUNT_OFFSET);
3013         }
makeInitialBallotsvkt::Reconvergence::__anone030def80111::VertexRandomProgram::Arrangement3014         static Ballots makeInitialBallots(uint32_t subgroupSize, uint32_t primitiveStride,
3015                                           add_cref<std::vector<uint32_t>> outputP)
3016         {
3017             DE_UNREF(subgroupSize);
3018             const uint32_t subgroupCount = calcSubgroupCount(outputP);
3019             Ballots initialBallots(subgroupCount);
3020             for (uint32_t primitiveID = 0u; primitiveID < primitiveStride; ++primitiveID)
3021             {
3022                 const uint32_t id = outputP.at(primitiveID + INVOCATION_ENTRIES_OFFSET);
3023                 if (id)
3024                 {
3025                     const uint32_t subgroupID           = (id >> 16) - 1u;
3026                     const uint32_t subgroupInvocationID = id & 0xFFFF;
3027                     DE_ASSERT(subgroupID < subgroupCount);
3028                     DE_ASSERT(subgroupInvocationID < subgroupSize);
3029                     initialBallots.at(subgroupID).set(subgroupInvocationID);
3030                 }
3031             }
3032             return initialBallots;
3033         }
makePrimitiveSubgroupsvkt::Reconvergence::__anone030def80111::VertexRandomProgram::Arrangement3034         static std::vector<uint32_t> makePrimitiveSubgroups(uint32_t subgroupSize, uint32_t primitiveStride,
3035                                                             add_cref<std::vector<uint32_t>> outputP)
3036         {
3037             std::vector<uint32_t> map(primitiveStride);
3038             for (uint32_t primitiveID = 0u; primitiveID < primitiveStride; ++primitiveID)
3039             {
3040                 const uint32_t id = outputP.at(primitiveID + INVOCATION_ENTRIES_OFFSET);
3041                 if (id)
3042                 {
3043                     const uint32_t subgroupID           = (id >> 16) - 1u;
3044                     const uint32_t subgroupInvocationID = id & 0xFFFF;
3045                     DE_ASSERT(subgroupInvocationID < subgroupSize);
3046                     map.at(primitiveID) = subgroupID * subgroupSize + subgroupInvocationID;
3047                 }
3048             }
3049             return map;
3050         }
generatePrimitivesvkt::Reconvergence::__anone030def80111::VertexRandomProgram::Arrangement3051         static std::vector<tcu::Vec4> generatePrimitives(uint32_t width, uint32_t height, uint32_t fillPercent)
3052         {
3053             deRandom rnd;
3054             std::map<uint32_t, int> map;
3055             std::vector<tcu::Vec4> points;
3056             const uint32_t frags = (width * height);
3057             const uint32_t total = (frags * fillPercent) / 100u;
3058 
3059             deRandom_init(&rnd, (width * height));
3060 
3061             for (uint32_t i = 0u; i < total; ++i)
3062             {
3063                 const uint32_t r = deRandom_getUint32(&rnd) % frags;
3064                 if (map[r] != 0)
3065                 {
3066                     i -= 1;
3067                     continue;
3068                 }
3069                 map[r] = 1;
3070 
3071                 uint32_t y = r / width;
3072                 uint32_t x = r % width;
3073                 float xx   = (float(x) + float(x + 1)) / (2.0f * float(width));
3074                 float yy   = (float(y) + float(y + 1)) / (2.0f * float(height));
3075                 float xxx  = xx * 2.0f - 1.0f;
3076                 float yyy  = yy * 2.0f - 1.0f;
3077                 points.emplace_back(tcu::Vec4(xxx, yyy, 0u, 0u));
3078             }
3079             return points;
3080         }
generateOutputPvectorvkt::Reconvergence::__anone030def80111::VertexRandomProgram::Arrangement3081         static std::vector<uint32_t> generateOutputPvector(uint32_t subgroupSize, uint32_t vertexCount)
3082         {
3083             const uint32_t subgroupCount = ROUNDUP(vertexCount, subgroupSize) / subgroupSize;
3084             std::vector<uint32_t> outputP(vertexCount + INVOCATION_ENTRIES_OFFSET);
3085             outputP.at(NUM_SUBGROUPS_OFFSET)    = subgroupCount;
3086             outputP.at(SUBGROUP_SIZE_OFFSET)    = subgroupSize;
3087             outputP.at(INVOCATION_COUNT_OFFSET) = vertexCount;
3088             for (uint32_t vertexID = 0u; vertexID < vertexCount; ++vertexID)
3089             {
3090                 const uint32_t subgroupID                        = vertexID / subgroupSize;
3091                 const uint32_t subgroupInvocationID              = vertexID % subgroupSize;
3092                 outputP.at(vertexID + INVOCATION_ENTRIES_OFFSET) = ((subgroupID + 1u) << 16) | subgroupInvocationID;
3093             }
3094             return outputP;
3095         }
3096     };
3097 
simulate(bool countOnly,uint32_t subgroupSize,add_ref<std::vector<uint64_t>> ref)3098     virtual uint32_t simulate(bool countOnly, uint32_t subgroupSize, add_ref<std::vector<uint64_t>> ref) override
3099     {
3100         DE_ASSERT(false); // use overloaded version of simulate() instead
3101         DE_UNREF(countOnly);
3102         DE_UNREF(subgroupSize);
3103         DE_UNREF(ref);
3104         return 0;
3105     }
3106 
3107 protected:
genIf(IFType ifType,uint32_t)3108     virtual void genIf(IFType ifType, uint32_t /*maxLocalIndexCmp*/) override
3109     {
3110         RandomProgram::genIf(ifType, RandomProgram::invocationStride);
3111     }
3112 
getPartitionBallotText()3113     virtual std::string getPartitionBallotText() override
3114     {
3115         return "storeValue(outLoc++, subgroupBallot(true))";
3116     }
3117 
printIfLocalInvocationIndex(add_ref<std::stringstream> css,add_cref<FlowState> flow)3118     virtual void printIfLocalInvocationIndex(add_ref<std::stringstream> css, add_cref<FlowState> flow) override
3119     {
3120         printIndent(css);
3121         css << "if (invocationIndex() >= inputA.a[0x" << std::hex << flow.ops[flow.opsIndex].value << "]) {\n";
3122     }
3123 
printStore(add_ref<std::stringstream> css,add_cref<FlowState> flow)3124     virtual void printStore(add_ref<std::stringstream> css, add_cref<FlowState> flow) override
3125     {
3126         printIndent(css);
3127         css << "storeValue(outLoc++, 0x" << std::hex << flow.ops[flow.opsIndex].value << std::dec << ");\n";
3128     }
3129 
printBallot(add_ref<std::stringstream> css,add_cref<FlowState>,bool endWithSemicolon=false)3130     virtual void printBallot(add_ref<std::stringstream> css, add_cref<FlowState>,
3131                              bool endWithSemicolon = false) override
3132     {
3133         printIndent(css);
3134         // When inside loop(s), use partitionBallot rather than subgroupBallot to compute
3135         // a ballot, to make sure the ballot is "diverged enough". Don't do this for
3136         // subgroup_uniform_control_flow, since we only validate results that must be fully
3137         // reconverged.
3138         if (loopNesting > 0 && caseDef.testType == TT_MAXIMAL)
3139         {
3140             css << getPartitionBallotText();
3141         }
3142         else
3143         {
3144             css << "storeValue(outLoc++, subgroupBallot(true))";
3145         }
3146         if (endWithSemicolon)
3147         {
3148             css << ";\n";
3149         }
3150     }
3151 
simulateBallot(const bool countOnly,add_cref<Ballots> activeMask,const uint32_t unusedPrimitiveID,const int32_t opsIndex,add_ref<std::vector<uint32_t>> outLoc,add_ref<std::vector<tcu::UVec4>> ref,add_ref<tcu::TestLog> log,std::shared_ptr<Prerequisites> prerequisites,add_ref<uint32_t> logFailureCount,const OPType reason,const tcu::UVec4 * cmp)3152     virtual void simulateBallot(const bool countOnly, add_cref<Ballots> activeMask, const uint32_t unusedPrimitiveID,
3153                                 const int32_t opsIndex, add_ref<std::vector<uint32_t>> outLoc,
3154                                 add_ref<std::vector<tcu::UVec4>> ref, add_ref<tcu::TestLog> log,
3155                                 std::shared_ptr<Prerequisites> prerequisites, add_ref<uint32_t> logFailureCount,
3156                                 const OPType reason, const tcu::UVec4 *cmp) override
3157     {
3158         DE_UNREF(unusedPrimitiveID);
3159         DE_UNREF(opsIndex);
3160         add_cref<Arrangement> a(*std::static_pointer_cast<Arrangement>(prerequisites));
3161         for (uint32_t primitiveID = 0u; primitiveID < a.m_primitiveStride; ++primitiveID)
3162         {
3163             const uint32_t sgid = a.m_primitiveSubgroups.at(primitiveID);
3164             DE_ASSERT(sgid < (a.m_subgroupCount * a.m_subgroupSize));
3165             if (false == activeMask.test(Ballots::findBit(sgid, a.m_subgroupSize)))
3166                 continue;
3167             const uint32_t index = (outLoc.at(primitiveID)++) * a.m_invocationStride + primitiveID;
3168             if (false == countOnly)
3169             {
3170                 ref.at(index) = Ballot(activeMask.at(sgid / a.m_subgroupSize));
3171                 if (cmp && logFailureCount > 0u && cmp[index] != ref.at(index))
3172                 {
3173                     logFailureCount -= 1u;
3174                     log << tcu::TestLog::Message << logFailureCount << ": stored value mismatch from "
3175                         << OPtypeToStr(reason) << tcu::TestLog::EndMessage;
3176                 }
3177             }
3178         }
3179     }
3180 
simulateStore(const bool countOnly,add_cref<Ballots> activeMask,const uint32_t unusedPrimitiveID,const uint64_t storeValue,add_ref<std::vector<uint32_t>> outLoc,add_ref<std::vector<tcu::UVec4>> ref,add_ref<tcu::TestLog> log,std::shared_ptr<Prerequisites> prerequisites,add_ref<uint32_t> logFailureCount,const OPType reason,const tcu::UVec4 * cmp)3181     virtual void simulateStore(const bool countOnly, add_cref<Ballots> activeMask, const uint32_t unusedPrimitiveID,
3182                                const uint64_t storeValue, add_ref<std::vector<uint32_t>> outLoc,
3183                                add_ref<std::vector<tcu::UVec4>> ref, add_ref<tcu::TestLog> log,
3184                                std::shared_ptr<Prerequisites> prerequisites, add_ref<uint32_t> logFailureCount,
3185                                const OPType reason, const tcu::UVec4 *cmp) override
3186     {
3187         DE_UNREF(unusedPrimitiveID);
3188         add_cref<Arrangement> a(*std::static_pointer_cast<Arrangement>(prerequisites));
3189         for (uint32_t primitiveID = 0u; primitiveID < a.m_primitiveStride; ++primitiveID)
3190         {
3191             const uint32_t sgid = a.m_primitiveSubgroups.at(primitiveID);
3192             DE_ASSERT(sgid < (a.m_subgroupCount * a.m_subgroupSize));
3193             if (false == activeMask.test(Ballots::findBit(sgid, a.m_subgroupSize)))
3194                 continue;
3195             const uint32_t index = (outLoc.at(primitiveID)++) * a.m_invocationStride + primitiveID;
3196             if (false == countOnly)
3197             {
3198                 ref.at(index) = Ballot(tcu::UVec4(uint32_t(storeValue & 0xFFFFFFFF), 0u, 0u, 0u));
3199                 if (cmp && logFailureCount > 0u && cmp[index] != ref.at(index))
3200                 {
3201                     logFailureCount -= 1u;
3202                     log << tcu::TestLog::Message << logFailureCount << ": stored value mismatch from "
3203                         << OPtypeToStr(reason) << tcu::TestLog::EndMessage;
3204                 }
3205             }
3206         }
3207     }
3208 
makePrerequisites(add_cref<std::vector<uint32_t>> outputP,const uint32_t subgroupSize,const uint32_t fragmentStride,const uint32_t primitiveStride,add_ref<std::vector<SubgroupState2>> stateStack,add_ref<std::vector<uint32_t>> outLoc,add_ref<uint32_t> subgroupCount)3209     virtual std::shared_ptr<Prerequisites> makePrerequisites(add_cref<std::vector<uint32_t>> outputP,
3210                                                              const uint32_t subgroupSize, const uint32_t fragmentStride,
3211                                                              const uint32_t primitiveStride,
3212                                                              add_ref<std::vector<SubgroupState2>> stateStack,
3213                                                              add_ref<std::vector<uint32_t>> outLoc,
3214                                                              add_ref<uint32_t> subgroupCount) override
3215     {
3216         DE_UNREF(fragmentStride);
3217         auto prerequisites = std::make_shared<Arrangement>(outputP, subgroupSize, primitiveStride);
3218         subgroupCount      = prerequisites->m_subgroupCount;
3219         stateStack.resize(10u, SubgroupState2(subgroupCount));
3220         outLoc.resize(primitiveStride, 0u);
3221         stateStack.at(0).activeMask = prerequisites->m_initialBallots;
3222         return prerequisites;
3223     }
3224 };
3225 
3226 class TessCtrlRandomProgram : public RandomProgram
3227 {
3228 public:
TessCtrlRandomProgram(add_cref<CaseDef> c,uint32_t invocationCount)3229     TessCtrlRandomProgram(add_cref<CaseDef> c, uint32_t invocationCount) : RandomProgram(c, invocationCount)
3230     {
3231         DE_ASSERT(c.shaderStage == VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT);
3232     }
    // Defaulted virtual destructor.
    virtual ~TessCtrlRandomProgram() = default;

    // Factor that genIf() multiplies with caseDef.sizeX when computing the limit it passes on
    // to RandomProgram::genIf().
    static const uint32_t minSubgroupSize = 4;
3236 
genIf(IFType ifType,uint32_t)3237     virtual void genIf(IFType ifType, uint32_t /*maxLocalIndexCmp*/) override
3238     {
3239         RandomProgram::genIf(ifType, std::min((minSubgroupSize * caseDef.sizeX), 64u));
3240     }
3241 
printIfLocalInvocationIndex(add_ref<std::stringstream> css,add_cref<FlowState> flow)3242     virtual void printIfLocalInvocationIndex(add_ref<std::stringstream> css, add_cref<FlowState> flow) override
3243     {
3244         printIndent(css);
3245         css << "if (";
3246         css << "((((gl_PrimitiveID * width) / gl_SubgroupSize) * gl_SubgroupSize) + gl_SubgroupInvocationID)";
3247         css << " >= inputA.a[0x" << std::hex << flow.ops[flow.opsIndex].value << "]) {\n";
3248     }
3249 
printStore(add_ref<std::stringstream> css,add_cref<FlowState> flow)3250     virtual void printStore(add_ref<std::stringstream> css, add_cref<FlowState> flow) override
3251     {
3252         printIndent(css);
3253         css << "outputC.loc[invocationIndex()]++;\n";
3254         printIndent(css);
3255         css << "outputB.b[(outLoc++) * invocationStride + invocationIndex()].x = 0x" << std::hex
3256             << flow.ops[flow.opsIndex].value << ";\n";
3257     }
3258 
printBallot(add_ref<std::stringstream> css,add_cref<FlowState>,bool endWithSemicolon=false)3259     virtual void printBallot(add_ref<std::stringstream> css, add_cref<FlowState>,
3260                              bool endWithSemicolon = false) override
3261     {
3262         printIndent(css);
3263 
3264         css << "outputC.loc[invocationIndex()]++,";
3265         // When inside loop(s), use partitionBallot rather than subgroupBallot to compute
3266         // a ballot, to make sure the ballot is "diverged enough". Don't do this for
3267         // subgroup_uniform_control_flow, since we only validate results that must be fully
3268         // reconverged.
3269         if (loopNesting > 0 && caseDef.testType == TT_MAXIMAL)
3270         {
3271             css << "outputB.b[(outLoc++) * invocationStride + invocationIndex()] = " << getPartitionBallotText()
3272                 << ".xy";
3273         }
3274         else
3275         {
3276             css << "outputB.b[(outLoc++) * invocationStride + invocationIndex()] = subgroupBallot(true).xy";
3277         }
3278         if (endWithSemicolon)
3279         {
3280             css << ";\n";
3281         }
3282     }
3283 
simulateStoreToChange(bool countOnly,uint32_t,const SubgroupState (& stateStack)[10],int32_t opsIndex,add_ref<std::vector<uint32_t>> outLoc,add_ref<std::vector<uint64_t>> ref)3284     void simulateStoreToChange(bool countOnly, uint32_t /*subgroupSize*/, const SubgroupState (&stateStack)[10],
3285                                int32_t opsIndex, add_ref<std::vector<uint32_t>> outLoc,
3286                                add_ref<std::vector<uint64_t>> ref)
3287     {
3288         for (uint32_t id = 0; id < invocationStride; ++id)
3289         {
3290             if (stateStack[nesting].activeMask.test(id))
3291             {
3292                 if (countOnly)
3293                     outLoc[id]++;
3294                 else
3295                     ref[(outLoc[id]++) * invocationStride + id] = ops[opsIndex].value;
3296             }
3297         }
3298     }
3299 
simulateBallotToChange(bool countOnly,uint32_t subgroupSize,const SubgroupState (& stateStack)[10],uint32_t,add_ref<std::vector<uint32_t>> outLoc,add_ref<std::vector<uint64_t>> ref)3300     void simulateBallotToChange(bool countOnly, uint32_t subgroupSize, const SubgroupState (&stateStack)[10],
3301                                 uint32_t /*opsIndex*/, add_ref<std::vector<uint32_t>> outLoc,
3302                                 add_ref<std::vector<uint64_t>> ref)
3303     {
3304         for (uint32_t id = 0; id < invocationStride; ++id)
3305         {
3306             if (stateStack[nesting].activeMask.test(id))
3307             {
3308                 if (countOnly)
3309                     outLoc[id]++;
3310                 else
3311                     ref[(outLoc[id]++) * invocationStride + id] =
3312                         bitsetToU64(stateStack[nesting].activeMask, subgroupSize, id);
3313             }
3314         }
3315     }
3316 
3317     // Simulate execution of the program. If countOnly is true, just return
3318     // the max number of outputs written. If it's false, store out the result
3319     // values to ref.
simulate(bool countOnly,uint32_t subgroupSize,add_ref<std::vector<uint64_t>> ref)3320     virtual uint32_t simulate(bool countOnly, uint32_t subgroupSize, add_ref<std::vector<uint64_t>> ref) override
3321     {
3322         SubgroupState stateStack[10];
3323         deMemset(&stateStack, 0, sizeof(stateStack));
3324 
3325         // Per-invocation output location counters
3326         std::vector<uint32_t> outLoc(invocationStride, 0u);
3327 
3328         nesting     = 0;
3329         loopNesting = 0;
3330 
3331         for (uint32_t k = 0; k < invocationStride; ++k)
3332             stateStack[nesting].activeMask.set(k);
3333 
3334         int32_t i = 0;
3335         while (i < (int32_t)ops.size())
3336         {
3337             switch (ops[i].type)
3338             {
3339             case OP_BALLOT:
3340                 simulateBallotToChange(countOnly, subgroupSize, stateStack, i, outLoc, ref);
3341                 break;
3342             case OP_STORE:
3343                 simulateStoreToChange(countOnly, subgroupSize, stateStack, i, outLoc, ref);
3344                 break;
3345             case OP_IF_MASK:
3346                 nesting++;
3347                 stateStack[nesting].activeMask =
3348                     stateStack[nesting - 1].activeMask & bitsetFromU64(ops[i].value, subgroupSize);
3349                 stateStack[nesting].header   = i;
3350                 stateStack[nesting].isLoop   = 0;
3351                 stateStack[nesting].isSwitch = 0;
3352                 break;
3353             case OP_ELSE_MASK:
3354                 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask &
3355                                                  ~bitsetFromU64(ops[stateStack[nesting].header].value, subgroupSize);
3356                 break;
3357             case OP_IF_LOOPCOUNT:
3358             {
3359                 uint32_t n = nesting;
3360                 while (!stateStack[n].isLoop)
3361                     n--;
3362 
3363                 nesting++;
3364                 stateStack[nesting].activeMask =
3365                     stateStack[nesting - 1].activeMask & bitsetFromU64((1ULL << stateStack[n].tripCount), subgroupSize);
3366                 stateStack[nesting].header   = i;
3367                 stateStack[nesting].isLoop   = 0;
3368                 stateStack[nesting].isSwitch = 0;
3369                 break;
3370             }
3371             case OP_ELSE_LOOPCOUNT:
3372             {
3373                 uint32_t n = nesting;
3374                 while (!stateStack[n].isLoop)
3375                     n--;
3376 
3377                 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask &
3378                                                  ~bitsetFromU64((1ULL << stateStack[n].tripCount), subgroupSize);
3379                 break;
3380             }
3381             case OP_IF_LOCAL_INVOCATION_INDEX: // TessCtrlRandomProgram
3382             {
3383                 // all bits >= N
3384                 bitset_inv_t mask;
3385                 for (uint32_t j = static_cast<uint32_t>(ops[i].value); j < invocationStride; ++j)
3386                     mask.set(j);
3387 
3388                 nesting++;
3389                 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask & mask;
3390                 stateStack[nesting].header     = i;
3391                 stateStack[nesting].isLoop     = 0;
3392                 stateStack[nesting].isSwitch   = 0;
3393                 break;
3394             }
3395             case OP_ELSE_LOCAL_INVOCATION_INDEX: // TessCtrlRandomProgram
3396             {
3397                 // all bits < N
3398                 bitset_inv_t mask;
3399                 for (uint32_t j = 0; j < static_cast<uint32_t>(ops[i].value); ++j)
3400                     mask.set(j);
3401 
3402                 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask & mask;
3403                 break;
3404             }
3405             case OP_ENDIF:
3406                 nesting--;
3407                 break;
3408             case OP_BEGIN_FOR_UNIF:
3409                 // XXX TODO: We don't handle a for loop with zero iterations
3410                 nesting++;
3411                 loopNesting++;
3412                 stateStack[nesting].activeMask   = stateStack[nesting - 1].activeMask;
3413                 stateStack[nesting].header       = i;
3414                 stateStack[nesting].tripCount    = 0;
3415                 stateStack[nesting].isLoop       = 1;
3416                 stateStack[nesting].isSwitch     = 0;
3417                 stateStack[nesting].continueMask = 0;
3418                 break;
3419             case OP_END_FOR_UNIF:
3420                 stateStack[nesting].tripCount++;
3421                 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
3422                 stateStack[nesting].continueMask = 0;
3423                 if (stateStack[nesting].tripCount < ops[stateStack[nesting].header].value &&
3424                     stateStack[nesting].activeMask.any())
3425                 {
3426                     i = stateStack[nesting].header + 1;
3427                     continue;
3428                 }
3429                 else
3430                 {
3431                     loopNesting--;
3432                     nesting--;
3433                 }
3434                 break;
3435             case OP_BEGIN_DO_WHILE_UNIF:
3436                 // XXX TODO: We don't handle a for loop with zero iterations
3437                 nesting++;
3438                 loopNesting++;
3439                 stateStack[nesting].activeMask   = stateStack[nesting - 1].activeMask;
3440                 stateStack[nesting].header       = i;
3441                 stateStack[nesting].tripCount    = 1;
3442                 stateStack[nesting].isLoop       = 1;
3443                 stateStack[nesting].isSwitch     = 0;
3444                 stateStack[nesting].continueMask = 0;
3445                 break;
3446             case OP_END_DO_WHILE_UNIF:
3447                 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
3448                 stateStack[nesting].continueMask = 0;
3449                 if (stateStack[nesting].tripCount < ops[stateStack[nesting].header].value &&
3450                     stateStack[nesting].activeMask.any())
3451                 {
3452                     i = stateStack[nesting].header + 1;
3453                     stateStack[nesting].tripCount++;
3454                     continue;
3455                 }
3456                 else
3457                 {
3458                     loopNesting--;
3459                     nesting--;
3460                 }
3461                 break;
3462             case OP_BEGIN_FOR_VAR:
3463                 // XXX TODO: We don't handle a for loop with zero iterations
3464                 nesting++;
3465                 loopNesting++;
3466                 stateStack[nesting].activeMask   = stateStack[nesting - 1].activeMask;
3467                 stateStack[nesting].header       = i;
3468                 stateStack[nesting].tripCount    = 0;
3469                 stateStack[nesting].isLoop       = 1;
3470                 stateStack[nesting].isSwitch     = 0;
3471                 stateStack[nesting].continueMask = 0;
3472                 break;
3473             case OP_END_FOR_VAR:
3474                 stateStack[nesting].tripCount++;
3475                 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
3476                 stateStack[nesting].continueMask = 0;
3477                 stateStack[nesting].activeMask &= bitsetFromU64(stateStack[nesting].tripCount == subgroupSize ?
3478                                                                     0 :
3479                                                                     ~((1ULL << (stateStack[nesting].tripCount)) - 1),
3480                                                                 subgroupSize);
3481                 if (stateStack[nesting].activeMask.any())
3482                 {
3483                     i = stateStack[nesting].header + 1;
3484                     continue;
3485                 }
3486                 else
3487                 {
3488                     loopNesting--;
3489                     nesting--;
3490                 }
3491                 break;
3492             case OP_BEGIN_FOR_INF:
3493             case OP_BEGIN_DO_WHILE_INF:
3494                 nesting++;
3495                 loopNesting++;
3496                 stateStack[nesting].activeMask   = stateStack[nesting - 1].activeMask;
3497                 stateStack[nesting].header       = i;
3498                 stateStack[nesting].tripCount    = 0;
3499                 stateStack[nesting].isLoop       = 1;
3500                 stateStack[nesting].isSwitch     = 0;
3501                 stateStack[nesting].continueMask = 0;
3502                 break;
3503             case OP_END_FOR_INF:
3504                 stateStack[nesting].tripCount++;
3505                 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
3506                 stateStack[nesting].continueMask = 0;
3507                 if (stateStack[nesting].activeMask.any())
3508                 {
3509                     // output expected OP_BALLOT values
3510                     simulateBallotToChange(countOnly, subgroupSize, stateStack, i, outLoc, ref);
3511 
3512                     i = stateStack[nesting].header + 1;
3513                     continue;
3514                 }
3515                 else
3516                 {
3517                     loopNesting--;
3518                     nesting--;
3519                 }
3520                 break;
3521             case OP_END_DO_WHILE_INF:
3522                 stateStack[nesting].tripCount++;
3523                 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
3524                 stateStack[nesting].continueMask = 0;
3525                 if (stateStack[nesting].activeMask.any())
3526                 {
3527                     i = stateStack[nesting].header + 1;
3528                     continue;
3529                 }
3530                 else
3531                 {
3532                     loopNesting--;
3533                     nesting--;
3534                 }
3535                 break;
3536             case OP_BREAK:
3537             {
3538                 uint32_t n        = nesting;
3539                 bitset_inv_t mask = stateStack[nesting].activeMask;
3540                 while (true)
3541                 {
3542                     stateStack[n].activeMask &= ~mask;
3543                     if (stateStack[n].isLoop || stateStack[n].isSwitch)
3544                         break;
3545 
3546                     n--;
3547                 }
3548             }
3549             break;
3550             case OP_CONTINUE:
3551             {
3552                 uint32_t n        = nesting;
3553                 bitset_inv_t mask = stateStack[nesting].activeMask;
3554                 while (true)
3555                 {
3556                     stateStack[n].activeMask &= ~mask;
3557                     if (stateStack[n].isLoop)
3558                     {
3559                         stateStack[n].continueMask |= mask;
3560                         break;
3561                     }
3562                     n--;
3563                 }
3564             }
3565             break;
3566             case OP_ELECT:
3567             {
3568                 nesting++;
3569                 stateStack[nesting].activeMask = bitsetElect(stateStack[nesting - 1].activeMask, subgroupSize);
3570                 stateStack[nesting].header     = i;
3571                 stateStack[nesting].isLoop     = 0;
3572                 stateStack[nesting].isSwitch   = 0;
3573             }
3574             break;
3575             case OP_RETURN:
3576             {
3577                 bitset_inv_t mask = stateStack[nesting].activeMask;
3578                 for (int32_t n = nesting; n >= 0; --n)
3579                 {
3580                     stateStack[n].activeMask &= ~mask;
3581                     if (stateStack[n].isCall)
3582                         break;
3583                 }
3584             }
3585             break;
3586 
3587             case OP_CALL_BEGIN:
3588                 nesting++;
3589                 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
3590                 stateStack[nesting].isLoop     = 0;
3591                 stateStack[nesting].isSwitch   = 0;
3592                 stateStack[nesting].isCall     = 1;
3593                 break;
3594             case OP_CALL_END:
3595                 stateStack[nesting].isCall = 0;
3596                 nesting--;
3597                 break;
3598             case OP_NOISE:
3599                 break;
3600 
3601             case OP_SWITCH_UNIF_BEGIN:
3602             case OP_SWITCH_VAR_BEGIN:
3603             case OP_SWITCH_LOOP_COUNT_BEGIN:
3604                 nesting++;
3605                 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
3606                 stateStack[nesting].header     = i;
3607                 stateStack[nesting].isLoop     = 0;
3608                 stateStack[nesting].isSwitch   = 1;
3609                 break;
3610             case OP_SWITCH_END:
3611                 nesting--;
3612                 break;
3613             case OP_CASE_MASK_BEGIN:
3614                 stateStack[nesting].activeMask =
3615                     stateStack[nesting - 1].activeMask & bitsetFromU64(ops[i].value, subgroupSize);
3616                 break;
3617             case OP_CASE_LOOP_COUNT_BEGIN:
3618             {
3619                 uint32_t n = nesting;
3620                 uint32_t l = loopNesting;
3621 
3622                 while (true)
3623                 {
3624                     if (stateStack[n].isLoop)
3625                     {
3626                         l--;
3627                         if (l == ops[stateStack[nesting].header].value)
3628                             break;
3629                     }
3630                     n--;
3631                 }
3632 
3633                 if ((1ULL << stateStack[n].tripCount) & ops[i].value)
3634                     stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
3635                 else
3636                     stateStack[nesting].activeMask = 0;
3637                 break;
3638             }
3639             case OP_CASE_END:
3640                 break;
3641 
3642             default:
3643                 DE_ASSERT(0);
3644                 break;
3645             }
3646             i++;
3647         }
3648         uint32_t maxLoc = 0;
3649         for (uint32_t id = 0; id < (uint32_t)outLoc.size(); ++id)
3650             maxLoc = de::max(maxLoc, outLoc[id]);
3651 
3652         return maxLoc;
3653     }
3654 };
3655 
3656 class TessEvalRandomProgram : public RandomProgram
3657 {
3658 public:
TessEvalRandomProgram(add_cref<CaseDef> c,uint32_t invocationCount=0)3659     TessEvalRandomProgram(add_cref<CaseDef> c, uint32_t invocationCount = 0)
3660         : RandomProgram(c, (invocationCount ? invocationCount : 64))
3661         , ifLocalInvocationIndexAsSubgroupInvocationID(false)
3662     {
3663         DE_ASSERT(c.shaderStage == VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT);
3664     }
3665     virtual ~TessEvalRandomProgram() = default;
3666 
3667     const bool ifLocalInvocationIndexAsSubgroupInvocationID;
3668     static const uint32_t quadInvocationCount = 4;
3669 
3670     // Simulate execution of the program. If countOnly is true, just return
3671     // the max number of outputs written. If it's false, store out the result
3672     // values to ref.
simulate(bool countOnly,uint32_t subgroupSize,add_ref<std::vector<uint64_t>> ref)3673     virtual uint32_t simulate(bool countOnly, uint32_t subgroupSize, add_ref<std::vector<uint64_t>> ref) override
3674     {
3675         SubgroupState stateStack[10];
3676         deMemset(&stateStack, 0, sizeof(stateStack));
3677 
3678         // Per-invocation output location counters
3679         std::vector<uint32_t> outLoc(invocationStride, 0u);
3680 
3681         nesting     = 0;
3682         loopNesting = 0;
3683 
3684         for (uint32_t k = 0; k < invocationStride; ++k)
3685             stateStack[nesting].activeMask.set(k);
3686 
3687         int32_t i = 0;
3688         while (i < (int32_t)ops.size())
3689         {
3690             switch (ops[i].type)
3691             {
3692             case OP_BALLOT:
3693                 simulateBallotToChange(countOnly, subgroupSize, stateStack, i, outLoc, ref);
3694                 break;
3695             case OP_STORE:
3696                 simulateStoreToChange(countOnly, subgroupSize, stateStack, i, outLoc, ref);
3697                 break;
3698             case OP_IF_MASK:
3699                 nesting++;
3700                 stateStack[nesting].activeMask =
3701                     stateStack[nesting - 1].activeMask & bitsetFromU64(ops[i].value, subgroupSize);
3702                 stateStack[nesting].header   = i;
3703                 stateStack[nesting].isLoop   = 0;
3704                 stateStack[nesting].isSwitch = 0;
3705                 break;
3706             case OP_ELSE_MASK:
3707                 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask &
3708                                                  ~bitsetFromU64(ops[stateStack[nesting].header].value, subgroupSize);
3709                 break;
3710             case OP_IF_LOOPCOUNT:
3711             {
3712                 uint32_t n = nesting;
3713                 while (!stateStack[n].isLoop)
3714                     n--;
3715 
3716                 nesting++;
3717                 stateStack[nesting].activeMask =
3718                     stateStack[nesting - 1].activeMask & bitsetFromU64((1ULL << stateStack[n].tripCount), subgroupSize);
3719                 stateStack[nesting].header   = i;
3720                 stateStack[nesting].isLoop   = 0;
3721                 stateStack[nesting].isSwitch = 0;
3722                 break;
3723             }
3724             case OP_ELSE_LOOPCOUNT:
3725             {
3726                 uint32_t n = nesting;
3727                 while (!stateStack[n].isLoop)
3728                     n--;
3729 
3730                 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask &
3731                                                  ~bitsetFromU64((1ULL << stateStack[n].tripCount), subgroupSize);
3732                 break;
3733             }
3734             case OP_IF_LOCAL_INVOCATION_INDEX: // TessEvalRandomProgram
3735             {
3736                 bitset_inv_t mask;
3737                 if (ifLocalInvocationIndexAsSubgroupInvocationID)
3738                 {
3739                     // if (gl_SubgroupInvocationID >= value), all bits >= N
3740                     for (uint32_t j = static_cast<uint32_t>(ops[i].value); j < subgroupSize; ++j)
3741                         mask.set(j);
3742                     mask = bitsetFromU64(mask.to_ullong(), subgroupSize);
3743                 }
3744                 else
3745                 {
3746                     // all bits >= N
3747                     for (uint32_t j = (uint32_t)ops[i].value; j < invocationStride; ++j)
3748                         mask.set(j);
3749                 }
3750 
3751                 nesting++;
3752                 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask & mask;
3753                 stateStack[nesting].header     = i;
3754                 stateStack[nesting].isLoop     = 0;
3755                 stateStack[nesting].isSwitch   = 0;
3756                 break;
3757             }
3758             case OP_ELSE_LOCAL_INVOCATION_INDEX: // TessEvalRandomProgram
3759             {
3760                 // all bits < N
3761                 bitset_inv_t mask;
3762                 for (uint32_t j = 0; j < static_cast<uint32_t>(ops[i].value); ++j)
3763                     mask.set(j);
3764 
3765                 if (ifLocalInvocationIndexAsSubgroupInvocationID)
3766                 {
3767                     // else (gl_SubgroupInvocationID >= value), all bits < N
3768                     mask = bitsetFromU64(mask.to_ullong(), subgroupSize);
3769                 }
3770 
3771                 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask & mask;
3772                 break;
3773             }
3774             case OP_ENDIF:
3775                 nesting--;
3776                 break;
3777             case OP_BEGIN_FOR_UNIF:
3778                 // XXX TODO: We don't handle a for loop with zero iterations
3779                 nesting++;
3780                 loopNesting++;
3781                 stateStack[nesting].activeMask   = stateStack[nesting - 1].activeMask;
3782                 stateStack[nesting].header       = i;
3783                 stateStack[nesting].tripCount    = 0;
3784                 stateStack[nesting].isLoop       = 1;
3785                 stateStack[nesting].isSwitch     = 0;
3786                 stateStack[nesting].continueMask = 0;
3787                 break;
3788             case OP_END_FOR_UNIF:
3789                 stateStack[nesting].tripCount++;
3790                 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
3791                 stateStack[nesting].continueMask = 0;
3792                 if (stateStack[nesting].tripCount < ops[stateStack[nesting].header].value &&
3793                     stateStack[nesting].activeMask.any())
3794                 {
3795                     i = stateStack[nesting].header + 1;
3796                     continue;
3797                 }
3798                 else
3799                 {
3800                     loopNesting--;
3801                     nesting--;
3802                 }
3803                 break;
3804             case OP_BEGIN_DO_WHILE_UNIF:
3805                 // XXX TODO: We don't handle a for loop with zero iterations
3806                 nesting++;
3807                 loopNesting++;
3808                 stateStack[nesting].activeMask   = stateStack[nesting - 1].activeMask;
3809                 stateStack[nesting].header       = i;
3810                 stateStack[nesting].tripCount    = 1;
3811                 stateStack[nesting].isLoop       = 1;
3812                 stateStack[nesting].isSwitch     = 0;
3813                 stateStack[nesting].continueMask = 0;
3814                 break;
3815             case OP_END_DO_WHILE_UNIF:
3816                 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
3817                 stateStack[nesting].continueMask = 0;
3818                 if (stateStack[nesting].tripCount < ops[stateStack[nesting].header].value &&
3819                     stateStack[nesting].activeMask.any())
3820                 {
3821                     i = stateStack[nesting].header + 1;
3822                     stateStack[nesting].tripCount++;
3823                     continue;
3824                 }
3825                 else
3826                 {
3827                     loopNesting--;
3828                     nesting--;
3829                 }
3830                 break;
3831             case OP_BEGIN_FOR_VAR:
3832                 // XXX TODO: We don't handle a for loop with zero iterations
3833                 nesting++;
3834                 loopNesting++;
3835                 stateStack[nesting].activeMask   = stateStack[nesting - 1].activeMask;
3836                 stateStack[nesting].header       = i;
3837                 stateStack[nesting].tripCount    = 0;
3838                 stateStack[nesting].isLoop       = 1;
3839                 stateStack[nesting].isSwitch     = 0;
3840                 stateStack[nesting].continueMask = 0;
3841                 break;
3842             case OP_END_FOR_VAR:
3843                 stateStack[nesting].tripCount++;
3844                 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
3845                 stateStack[nesting].continueMask = 0;
3846                 stateStack[nesting].activeMask &= bitsetFromU64(stateStack[nesting].tripCount == subgroupSize ?
3847                                                                     0 :
3848                                                                     ~((1ULL << (stateStack[nesting].tripCount)) - 1),
3849                                                                 subgroupSize);
3850                 if (stateStack[nesting].activeMask.any())
3851                 {
3852                     i = stateStack[nesting].header + 1;
3853                     continue;
3854                 }
3855                 else
3856                 {
3857                     loopNesting--;
3858                     nesting--;
3859                 }
3860                 break;
3861             case OP_BEGIN_FOR_INF:
3862             case OP_BEGIN_DO_WHILE_INF:
3863                 nesting++;
3864                 loopNesting++;
3865                 stateStack[nesting].activeMask   = stateStack[nesting - 1].activeMask;
3866                 stateStack[nesting].header       = i;
3867                 stateStack[nesting].tripCount    = 0;
3868                 stateStack[nesting].isLoop       = 1;
3869                 stateStack[nesting].isSwitch     = 0;
3870                 stateStack[nesting].continueMask = 0;
3871                 break;
3872             case OP_END_FOR_INF:
3873                 stateStack[nesting].tripCount++;
3874                 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
3875                 stateStack[nesting].continueMask = 0;
3876                 if (stateStack[nesting].activeMask.any())
3877                 {
3878                     // output expected OP_BALLOT values
3879                     simulateBallotToChange(countOnly, subgroupSize, stateStack, i, outLoc, ref);
3880 
3881                     i = stateStack[nesting].header + 1;
3882                     continue;
3883                 }
3884                 else
3885                 {
3886                     loopNesting--;
3887                     nesting--;
3888                 }
3889                 break;
3890             case OP_END_DO_WHILE_INF:
3891                 stateStack[nesting].tripCount++;
3892                 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
3893                 stateStack[nesting].continueMask = 0;
3894                 if (stateStack[nesting].activeMask.any())
3895                 {
3896                     i = stateStack[nesting].header + 1;
3897                     continue;
3898                 }
3899                 else
3900                 {
3901                     loopNesting--;
3902                     nesting--;
3903                 }
3904                 break;
3905             case OP_BREAK:
3906             {
3907                 uint32_t n        = nesting;
3908                 bitset_inv_t mask = stateStack[nesting].activeMask;
3909                 while (true)
3910                 {
3911                     stateStack[n].activeMask &= ~mask;
3912                     if (stateStack[n].isLoop || stateStack[n].isSwitch)
3913                         break;
3914 
3915                     n--;
3916                 }
3917             }
3918             break;
3919             case OP_CONTINUE:
3920             {
3921                 uint32_t n        = nesting;
3922                 bitset_inv_t mask = stateStack[nesting].activeMask;
3923                 while (true)
3924                 {
3925                     stateStack[n].activeMask &= ~mask;
3926                     if (stateStack[n].isLoop)
3927                     {
3928                         stateStack[n].continueMask |= mask;
3929                         break;
3930                     }
3931                     n--;
3932                 }
3933             }
3934             break;
3935             case OP_ELECT:
3936             {
3937                 nesting++;
3938                 stateStack[nesting].activeMask = bitsetElect(stateStack[nesting - 1].activeMask, subgroupSize);
3939                 stateStack[nesting].header     = i;
3940                 stateStack[nesting].isLoop     = 0;
3941                 stateStack[nesting].isSwitch   = 0;
3942             }
3943             break;
3944             case OP_RETURN:
3945             {
3946                 bitset_inv_t mask = stateStack[nesting].activeMask;
3947                 for (int32_t n = nesting; n >= 0; --n)
3948                 {
3949                     stateStack[n].activeMask &= ~mask;
3950                     if (stateStack[n].isCall)
3951                         break;
3952                 }
3953             }
3954             break;
3955 
3956             case OP_CALL_BEGIN:
3957                 nesting++;
3958                 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
3959                 stateStack[nesting].isLoop     = 0;
3960                 stateStack[nesting].isSwitch   = 0;
3961                 stateStack[nesting].isCall     = 1;
3962                 break;
3963             case OP_CALL_END:
3964                 stateStack[nesting].isCall = 0;
3965                 nesting--;
3966                 break;
3967             case OP_NOISE:
3968                 break;
3969 
3970             case OP_SWITCH_UNIF_BEGIN:
3971             case OP_SWITCH_VAR_BEGIN:
3972             case OP_SWITCH_LOOP_COUNT_BEGIN:
3973                 nesting++;
3974                 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
3975                 stateStack[nesting].header     = i;
3976                 stateStack[nesting].isLoop     = 0;
3977                 stateStack[nesting].isSwitch   = 1;
3978                 break;
3979             case OP_SWITCH_END:
3980                 nesting--;
3981                 break;
3982             case OP_CASE_MASK_BEGIN:
3983                 stateStack[nesting].activeMask =
3984                     stateStack[nesting - 1].activeMask & bitsetFromU64(ops[i].value, subgroupSize);
3985                 break;
3986             case OP_CASE_LOOP_COUNT_BEGIN:
3987             {
3988                 uint32_t n = nesting;
3989                 uint32_t l = loopNesting;
3990 
3991                 while (true)
3992                 {
3993                     if (stateStack[n].isLoop)
3994                     {
3995                         l--;
3996                         if (l == ops[stateStack[nesting].header].value)
3997                             break;
3998                     }
3999                     n--;
4000                 }
4001 
4002                 if ((1ULL << stateStack[n].tripCount) & ops[i].value)
4003                     stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
4004                 else
4005                     stateStack[nesting].activeMask = 0;
4006                 break;
4007             }
4008             case OP_CASE_END:
4009                 break;
4010 
4011             default:
4012                 DE_ASSERT(0);
4013                 break;
4014             }
4015             i++;
4016         }
4017         uint32_t maxLoc = 0;
4018         for (uint32_t id = 0; id < (uint32_t)outLoc.size(); ++id)
4019             maxLoc = de::max(maxLoc, outLoc[id]);
4020 
4021         return maxLoc;
4022     }
4023 
4024 protected:
genIf(IFType ifType,uint32_t)4025     virtual void genIf(IFType ifType, uint32_t /*maxLocalIndexCmp*/) override
4026     {
4027         RandomProgram::genIf(ifType, std::min(64u, (caseDef.sizeX * quadInvocationCount - 1)));
4028     }
4029 
printIfLocalInvocationIndex(add_ref<std::stringstream> css,add_cref<FlowState> flow)4030     virtual void printIfLocalInvocationIndex(add_ref<std::stringstream> css, add_cref<FlowState> flow) override
4031     {
4032         // uint invocationIndex() { return gl_PrimitiveID * width + gl_SubgroupInvocationID; }
4033         printIndent(css);
4034         css << "if (";
4035         if (ifLocalInvocationIndexAsSubgroupInvocationID)
4036             css << "gl_SubgroupInvocationID";
4037         else
4038             css << "((((gl_PrimitiveID * width) / gl_SubgroupSize) * gl_SubgroupSize) + gl_SubgroupInvocationID)";
4039         css << " >= inputA.a[0x" << std::hex << flow.ops[flow.opsIndex].value << "]) {\n";
4040     }
4041 
printStore(add_ref<std::stringstream> css,add_cref<FlowState> flow)4042     virtual void printStore(add_ref<std::stringstream> css, add_cref<FlowState> flow) override
4043     {
4044         printIndent(css);
4045         css << "outputC.loc[invocationIndex()]++;\n";
4046         printIndent(css);
4047         css << "outputB.b[(outLoc++)*invocationStride + invocationIndex()].x = 0x" << std::hex
4048             << flow.ops[flow.opsIndex].value << ";\n";
4049     }
4050 
printBallot(add_ref<std::stringstream> css,add_cref<FlowState>,bool endWithSemicolon=false)4051     virtual void printBallot(add_ref<std::stringstream> css, add_cref<FlowState>,
4052                              bool endWithSemicolon = false) override
4053     {
4054         printIndent(css);
4055 
4056         css << "outputC.loc[invocationIndex()]++,";
4057         // When inside loop(s), use partitionBallot rather than subgroupBallot to compute
4058         // a ballot, to make sure the ballot is "diverged enough". Don't do this for
4059         // subgroup_uniform_control_flow, since we only validate results that must be fully
4060         // reconverged.
4061         if (loopNesting > 0 && caseDef.testType == TT_MAXIMAL)
4062         {
4063             css << "outputB.b[(outLoc++)*invocationStride + invocationIndex()] = " << getPartitionBallotText() << ".xy";
4064         }
4065         else
4066         {
4067             css << "outputB.b[(outLoc++)*invocationStride + invocationIndex()] = subgroupBallot(true).xy";
4068         }
4069         if (endWithSemicolon)
4070         {
4071             css << ";\n";
4072         }
4073     }
4074 
simulateStoreToChange(bool countOnly,uint32_t,const SubgroupState (& stateStack)[10],int32_t opsIndex,add_ref<std::vector<uint32_t>> outLoc,add_ref<std::vector<uint64_t>> ref)4075     void simulateStoreToChange(bool countOnly, uint32_t /*subgroupSize*/, const SubgroupState (&stateStack)[10],
4076                                int32_t opsIndex, add_ref<std::vector<uint32_t>> outLoc,
4077                                add_ref<std::vector<uint64_t>> ref)
4078     {
4079         for (uint32_t id = 0; id < invocationStride; ++id)
4080         {
4081             if (stateStack[nesting].activeMask.test(id))
4082             {
4083                 if (countOnly)
4084                     outLoc[id]++;
4085                 else
4086                     ref[(outLoc[id]++) * invocationStride + id] = ops[opsIndex].value;
4087             }
4088         }
4089     }
4090 
simulateBallotToChange(bool countOnly,uint32_t subgroupSize,const SubgroupState (& stateStack)[10],uint32_t,add_ref<std::vector<uint32_t>> outLoc,add_ref<std::vector<uint64_t>> ref)4091     void simulateBallotToChange(bool countOnly, uint32_t subgroupSize, const SubgroupState (&stateStack)[10],
4092                                 uint32_t /*opsIndex*/, add_ref<std::vector<uint32_t>> outLoc,
4093                                 add_ref<std::vector<uint64_t>> ref)
4094     {
4095         for (uint32_t id = 0; id < invocationStride; ++id)
4096         {
4097             if (stateStack[nesting].activeMask.test(id))
4098             {
4099                 if (countOnly)
4100                     outLoc[id]++;
4101                 else
4102                     ref[(outLoc[id]++) * invocationStride + id] =
4103                         bitsetToU64(stateStack[nesting].activeMask, subgroupSize, id);
4104             }
4105         }
4106     }
4107 };
4108 
4109 class GeometryRandomProgram : public RandomProgram
4110 {
4111 public:
4112     static const constexpr uint32_t fillPercentage = 71u;
GeometryRandomProgram(add_cref<CaseDef> c)4113     GeometryRandomProgram(add_cref<CaseDef> c)
4114         : RandomProgram(c, Arrangement::calculatePrimitiveCount(c.sizeX, c.sizeY, fillPercentage))
4115     {
4116         DE_ASSERT(c.shaderStage == VK_SHADER_STAGE_GEOMETRY_BIT);
4117     }
4118     virtual ~GeometryRandomProgram() = default;
4119 
4120     struct Arrangement : Prerequisites
4121     {
4122         static constexpr uint32_t NUM_SUBGROUPS_OFFSET    = 0u;
4123         static constexpr uint32_t SUBGROUP_SIZE_OFFSET    = 1u;
4124         static constexpr uint32_t INVOCATION_COUNT_OFFSET = 2u;
4125         static constexpr uint32_t MAX_LOC_OFFSET          = 3u;
4126         static constexpr uint32_t MAX_IDENTITY_OFFSET     = 4u;
4127         static constexpr uint32_t INVOCATION_ENTRY_OFFSET = 5u;
4128 
4129         const uint32_t m_shaderSubgroupSize;
4130         const uint32_t m_shaderSubgroupCount;
4131         const uint32_t m_shaderInvocationCount;
4132         const uint32_t m_shaderMaxLoc;
4133         const uint32_t m_shaderMaxIdentity;
4134 
4135         const uint32_t m_subgroupSize;
4136         const uint32_t m_primitiveStride;
4137         const uint32_t m_invocationStride;
4138         const uint32_t m_subgroupCount;
4139         const Ballots m_initialBallots;
4140         const std::vector<uint32_t> m_primitiveSubgroups;
4141 
Arrangementvkt::Reconvergence::__anone030def80111::GeometryRandomProgram::Arrangement4142         Arrangement(add_cref<std::vector<uint32_t>> outputP, uint32_t subgroupSize, uint32_t primitiveStride)
4143             : m_shaderSubgroupSize(outputP.at(SUBGROUP_SIZE_OFFSET))
4144             , m_shaderSubgroupCount(outputP.at(NUM_SUBGROUPS_OFFSET))
4145             , m_shaderInvocationCount(outputP.at(INVOCATION_COUNT_OFFSET))
4146             , m_shaderMaxLoc(outputP.at(MAX_LOC_OFFSET))
4147             , m_shaderMaxIdentity(outputP.at(MAX_IDENTITY_OFFSET))
4148             , m_subgroupSize(subgroupSize)
4149             , m_primitiveStride(primitiveStride)
4150             , m_invocationStride(primitiveStride)
4151             , m_subgroupCount(ROUNDUP(primitiveStride, subgroupSize) / subgroupSize)
4152             , m_initialBallots(makeInitialBallots(outputP))
4153             , m_primitiveSubgroups(makePrimitiveSubgroups(outputP))
4154         {
4155         }
makeInitialBallotsvkt::Reconvergence::__anone030def80111::GeometryRandomProgram::Arrangement4156         static Ballots makeInitialBallots(add_cref<std::vector<uint32_t>> outputP)
4157         {
4158             const uint32_t subgroupCount = outputP.at(NUM_SUBGROUPS_OFFSET);
4159             const uint32_t subgroupSize  = outputP.at(SUBGROUP_SIZE_OFFSET);
4160             DE_UNREF(subgroupSize);
4161             const uint32_t primitiveStride = outputP.at(INVOCATION_COUNT_OFFSET);
4162             Ballots b(subgroupCount);
4163             for (uint32_t primitiveID = 0u; primitiveID < primitiveStride; ++primitiveID)
4164             {
4165                 const uint32_t id = outputP.at(primitiveID + INVOCATION_ENTRY_OFFSET);
4166                 if (id)
4167                 {
4168                     const uint32_t subgroupID           = (id >> 16) - 1u;
4169                     const uint32_t subgroupInvocationID = id & 0xFFFF;
4170                     DE_ASSERT(subgroupID < subgroupCount);
4171                     DE_ASSERT(subgroupInvocationID < subgroupSize);
4172                     b.at(subgroupID).set(subgroupInvocationID);
4173                 }
4174             }
4175             return b;
4176         }
makePrimitiveSubgroupsvkt::Reconvergence::__anone030def80111::GeometryRandomProgram::Arrangement4177         static std::vector<uint32_t> makePrimitiveSubgroups(add_cref<std::vector<uint32_t>> outputP)
4178         {
4179             const uint32_t subgroupSize    = outputP.at(SUBGROUP_SIZE_OFFSET);
4180             const uint32_t primitiveStride = outputP.at(INVOCATION_COUNT_OFFSET);
4181             std::vector<uint32_t> map(primitiveStride);
4182             for (uint32_t primitiveID = 0u; primitiveID < primitiveStride; ++primitiveID)
4183             {
4184                 const uint32_t id = outputP.at(primitiveID + INVOCATION_ENTRY_OFFSET);
4185                 if (id)
4186                 {
4187                     const uint32_t subgroupID           = (id >> 16) - 1u;
4188                     const uint32_t subgroupInvocationID = id & 0xFFFF;
4189                     DE_ASSERT(subgroupInvocationID < subgroupSize);
4190                     map.at(primitiveID) = subgroupID * subgroupSize + subgroupInvocationID;
4191                 }
4192             }
4193             return map;
4194         }
// Returns the number of distinct cells selected when fillPercent percent of a
// width x height grid is filled.
//
// The previous implementation drew random cells until it had collected
// (width * height * fillPercent) / 100 distinct ones and then returned how
// many it had collected, which is that same number by construction. The value
// is therefore computed directly. The result is clamped to the number of cells
// because a grid cannot provide more distinct cells than it has; without the
// clamp a fillPercent above 100 made the sampling loop spin forever.
static uint32_t calculatePrimitiveCount(uint32_t width, uint32_t height, uint32_t fillPercent)
{
    const uint32_t frags = (width * height);
    const uint32_t total = (frags * fillPercent) / 100u;
    return (total < frags) ? total : frags;
}
generatePrimitivesvkt::Reconvergence::__anone030def80111::GeometryRandomProgram::Arrangement4218         static std::vector<tcu::Vec4> generatePrimitives(uint32_t width, uint32_t height, uint32_t fillPercent)
4219         {
4220             deRandom rnd;
4221             std::map<uint32_t, int> map;
4222             std::vector<tcu::Vec4> points;
4223             const uint32_t frags = (width * height);
4224             const uint32_t total = (frags * fillPercent) / 100u;
4225 
4226             deRandom_init(&rnd, (width * height));
4227 
4228             for (uint32_t i = 0u; i < total; ++i)
4229             {
4230                 const uint32_t r = deRandom_getUint32(&rnd) % frags;
4231                 if (map[r] != 0)
4232                 {
4233                     i -= 1;
4234                     continue;
4235                 }
4236                 map[r] = 1;
4237 
4238                 uint32_t y = r / width;
4239                 uint32_t x = r % width;
4240                 float xx   = (float(x) + float(x + 1)) / (2.0f * float(width));
4241                 float yy   = (float(y) + float(y + 1)) / (2.0f * float(height));
4242                 float xxx  = xx * 2.0f - 1.0f;
4243                 float yyy  = yy * 2.0f - 1.0f;
4244                 points.emplace_back(tcu::Vec4(xxx, yyy, 0u, 0u));
4245             }
4246             return points;
4247         }
generateVectorOutputPvkt::Reconvergence::__anone030def80111::GeometryRandomProgram::Arrangement4248         static std::vector<uint32_t> generateVectorOutputP(uint32_t subgroupSize, uint32_t primitiveStride)
4249         {
4250             const uint32_t subgroupCount = ROUNDUP(primitiveStride, subgroupSize) / subgroupSize;
4251             std::vector<uint32_t> outputP(primitiveStride + INVOCATION_ENTRY_OFFSET);
4252             outputP.at(NUM_SUBGROUPS_OFFSET)    = subgroupCount;
4253             outputP.at(SUBGROUP_SIZE_OFFSET)    = subgroupSize;
4254             outputP.at(INVOCATION_COUNT_OFFSET) = primitiveStride;
4255             outputP.at(MAX_LOC_OFFSET)          = 0u;
4256             outputP.at(MAX_IDENTITY_OFFSET)     = 0u;
4257             for (uint32_t vertexID = 0u; vertexID < primitiveStride; ++vertexID)
4258             {
4259                 const uint32_t subgroupID                      = vertexID / subgroupSize;
4260                 const uint32_t subgroupInvocationID            = vertexID % subgroupSize;
4261                 outputP.at(vertexID + INVOCATION_ENTRY_OFFSET) = ((subgroupID + 1u) << 16) | subgroupInvocationID;
4262             }
4263             return outputP;
4264         }
generateVectorOutputPvkt::Reconvergence::__anone030def80111::GeometryRandomProgram::Arrangement4265         static std::vector<uint32_t> generateVectorOutputP(uint32_t subgroupSize, uint32_t width, uint32_t height,
4266                                                            uint32_t percent)
4267         {
4268             const uint32_t primitiveStride = calculatePrimitiveCount(width, height, percent);
4269             return generateVectorOutputP(subgroupSize, primitiveStride);
4270         }
4271     };
4272 
simulate(bool countOnly,uint32_t subgroupSize,add_ref<std::vector<uint64_t>> ref)4273     virtual uint32_t simulate(bool countOnly, uint32_t subgroupSize, add_ref<std::vector<uint64_t>> ref) override
4274     {
4275         DE_ASSERT(false); // use overloaded version of simulate() instead
4276         DE_UNREF(countOnly);
4277         DE_UNREF(subgroupSize);
4278         DE_UNREF(ref);
4279         return 0;
4280     }
4281 
4282 protected:
genIf(IFType ifType,uint32_t)4283     virtual void genIf(IFType ifType, uint32_t /*maxLocalIndexCmp*/) override
4284     {
4285         RandomProgram::genIf(ifType, RandomProgram::invocationStride);
4286     }
4287 
getPartitionBallotText()4288     virtual std::string getPartitionBallotText() override
4289     {
4290         return "storeValue(outLoc++, subgroupBallot(true))";
4291     }
4292 
printIfLocalInvocationIndex(add_ref<std::stringstream> css,add_cref<FlowState> flow)4293     virtual void printIfLocalInvocationIndex(add_ref<std::stringstream> css, add_cref<FlowState> flow) override
4294     {
4295         printIndent(css);
4296         css << "if (invocationIndex() >= inputA.a[0x" << std::hex << flow.ops[flow.opsIndex].value << "]) {\n";
4297     }
4298 
printStore(add_ref<std::stringstream> css,add_cref<FlowState> flow)4299     virtual void printStore(add_ref<std::stringstream> css, add_cref<FlowState> flow) override
4300     {
4301         printIndent(css);
4302         css << "storeValue(outLoc++, 0x" << std::hex << flow.ops[flow.opsIndex].value << std::dec << ");\n";
4303     }
4304 
printBallot(add_ref<std::stringstream> css,add_cref<FlowState>,bool endWithSemicolon=false)4305     virtual void printBallot(add_ref<std::stringstream> css, add_cref<FlowState>,
4306                              bool endWithSemicolon = false) override
4307     {
4308         printIndent(css);
4309         // When inside loop(s), use partitionBallot rather than subgroupBallot to compute
4310         // a ballot, to make sure the ballot is "diverged enough". Don't do this for
4311         // subgroup_uniform_control_flow, since we only validate results that must be fully
4312         // reconverged.
4313         if (loopNesting > 0 && caseDef.testType == TT_MAXIMAL)
4314         {
4315             css << getPartitionBallotText();
4316         }
4317         else
4318         {
4319             css << "storeValue(outLoc++, subgroupBallot(true))";
4320         }
4321         if (endWithSemicolon)
4322         {
4323             css << ";\n";
4324         }
4325     }
4326 
simulateBallot(const bool countOnly,add_cref<Ballots> activeMask,const uint32_t unusedPrimitiveID,const int32_t opsIndex,add_ref<std::vector<uint32_t>> outLoc,add_ref<std::vector<tcu::UVec4>> ref,add_ref<tcu::TestLog> log,std::shared_ptr<Prerequisites> prerequisites,add_ref<uint32_t> logFailureCount,const OPType reason,const tcu::UVec4 * cmp)4327     virtual void simulateBallot(const bool countOnly, add_cref<Ballots> activeMask, const uint32_t unusedPrimitiveID,
4328                                 const int32_t opsIndex, add_ref<std::vector<uint32_t>> outLoc,
4329                                 add_ref<std::vector<tcu::UVec4>> ref, add_ref<tcu::TestLog> log,
4330                                 std::shared_ptr<Prerequisites> prerequisites, add_ref<uint32_t> logFailureCount,
4331                                 const OPType reason, const tcu::UVec4 *cmp) override
4332     {
4333         DE_UNREF(unusedPrimitiveID);
4334         DE_UNREF(opsIndex);
4335         add_cref<Arrangement> a(*std::static_pointer_cast<Arrangement>(prerequisites));
4336         for (uint32_t primitiveID = 0u; primitiveID < a.m_primitiveStride; ++primitiveID)
4337         {
4338             const uint32_t sgid = a.m_primitiveSubgroups.at(primitiveID);
4339             DE_ASSERT(sgid < (a.m_subgroupCount * a.m_subgroupSize));
4340             if (false == activeMask.test(Ballots::findBit(sgid, a.m_subgroupSize)))
4341                 continue;
4342             const uint32_t index = (outLoc.at(primitiveID)++) * a.m_invocationStride + primitiveID;
4343             if (false == countOnly)
4344             {
4345                 ref.at(index) = Ballot(activeMask.at(sgid / a.m_subgroupSize));
4346                 if (cmp && logFailureCount > 0u && cmp[index] != ref.at(index))
4347                 {
4348                     logFailureCount -= 1u;
4349                     log << tcu::TestLog::Message << logFailureCount << ": stored value mismatch from "
4350                         << OPtypeToStr(reason) << tcu::TestLog::EndMessage;
4351                 }
4352             }
4353         }
4354     }
4355 
simulateStore(const bool countOnly,add_cref<Ballots> activeMask,const uint32_t unusedPrimitiveID,const uint64_t storeValue,add_ref<std::vector<uint32_t>> outLoc,add_ref<std::vector<tcu::UVec4>> ref,add_ref<tcu::TestLog> log,std::shared_ptr<Prerequisites> prerequisites,add_ref<uint32_t> logFailureCount,const OPType reason,const tcu::UVec4 * cmp)4356     virtual void simulateStore(const bool countOnly, add_cref<Ballots> activeMask, const uint32_t unusedPrimitiveID,
4357                                const uint64_t storeValue, add_ref<std::vector<uint32_t>> outLoc,
4358                                add_ref<std::vector<tcu::UVec4>> ref, add_ref<tcu::TestLog> log,
4359                                std::shared_ptr<Prerequisites> prerequisites, add_ref<uint32_t> logFailureCount,
4360                                const OPType reason, const tcu::UVec4 *cmp) override
4361     {
4362         DE_UNREF(unusedPrimitiveID);
4363         add_cref<Arrangement> a(*std::static_pointer_cast<Arrangement>(prerequisites));
4364         for (uint32_t primitiveID = 0u; primitiveID < a.m_primitiveStride; ++primitiveID)
4365         {
4366             const uint32_t sgid = a.m_primitiveSubgroups.at(primitiveID);
4367             DE_ASSERT(sgid < (a.m_subgroupCount * a.m_subgroupSize));
4368             if (false == activeMask.test(Ballots::findBit(sgid, a.m_subgroupSize)))
4369                 continue;
4370             const uint32_t index = (outLoc.at(primitiveID)++) * a.m_invocationStride + primitiveID;
4371             if (false == countOnly)
4372             {
4373                 ref.at(index) = Ballot(tcu::UVec4(uint32_t(storeValue & 0xFFFFFFFF), 0u, 0u, 0u));
4374                 if (cmp && logFailureCount > 0u && cmp[index] != ref.at(index))
4375                 {
4376                     logFailureCount -= 1u;
4377                     log << tcu::TestLog::Message << logFailureCount << ": stored value mismatch from "
4378                         << OPtypeToStr(reason) << tcu::TestLog::EndMessage;
4379                 }
4380             }
4381         }
4382     }
4383 
makePrerequisites(add_cref<std::vector<uint32_t>> outputP,const uint32_t subgroupSize,const uint32_t fragmentStride,const uint32_t primitiveStride,add_ref<std::vector<SubgroupState2>> stateStack,add_ref<std::vector<uint32_t>> outLoc,add_ref<uint32_t> subgroupCount)4384     virtual std::shared_ptr<Prerequisites> makePrerequisites(add_cref<std::vector<uint32_t>> outputP,
4385                                                              const uint32_t subgroupSize, const uint32_t fragmentStride,
4386                                                              const uint32_t primitiveStride,
4387                                                              add_ref<std::vector<SubgroupState2>> stateStack,
4388                                                              add_ref<std::vector<uint32_t>> outLoc,
4389                                                              add_ref<uint32_t> subgroupCount) override
4390     {
4391         DE_UNREF(fragmentStride);
4392         auto prerequisites = std::make_shared<Arrangement>(outputP, subgroupSize, primitiveStride);
4393         subgroupCount      = prerequisites->m_subgroupCount;
4394         stateStack.resize(10u, SubgroupState2(subgroupCount));
4395         outLoc.resize(primitiveStride, 0u);
4396         stateStack.at(0).activeMask = prerequisites->m_initialBallots;
4397         return prerequisites;
4398     }
4399 };
4400 
4401 class ReconvergenceTestCase : public TestCase
4402 {
4403 public:
ReconvergenceTestCase(tcu::TestContext & context,const std::string & name,const CaseDef data)4404     ReconvergenceTestCase(tcu::TestContext &context, const std::string &name, const CaseDef data)
4405         : TestCase(context, name)
4406         , m_data(data)
4407         , m_program()
4408         , m_subgroupSizeToMaxLoc()
4409     {
4410     }
4411     ~ReconvergenceTestCase(void) = default;
4412     virtual void delayedInit(void) override;
4413     virtual void checkSupport(Context &context) const override;
4414     virtual void initPrograms(SourceCollections &programCollection) const override;
4415     virtual TestInstance *createInstance(Context &context) const override;
4416     de::MovePtr<RandomProgram> selectProgram() const;
4417 
4418 private:
4419     CaseDef m_data;
4420     std::shared_ptr<RandomProgram> m_program;
4421     mutable std::map<uint32_t, uint32_t> m_subgroupSizeToMaxLoc;
4422 };
4423 
checkSupport(Context & context) const4424 void ReconvergenceTestCase::checkSupport(Context &context) const
4425 {
4426     if (!context.contextSupports(vk::ApiVersion(0u, 1u, 1u, 0u)))
4427         TCU_THROW(NotSupportedError, "Vulkan 1.1 not supported");
4428 
4429     const auto properties                                            = getSubgroupProperties(context);
4430     const vk::VkPhysicalDeviceSubgroupProperties &subgroupProperties = properties.first;
4431     const VkPhysicalDeviceLimits &limits                             = properties.second.properties.limits;
4432 
4433     if (m_data.isElect() && !(subgroupProperties.supportedOperations & VK_SUBGROUP_FEATURE_BASIC_BIT))
4434         TCU_THROW(NotSupportedError, "VK_SUBGROUP_FEATURE_BASIC_BIT not supported");
4435 
4436     if (!m_data.isElect() && !(subgroupProperties.supportedOperations & VK_SUBGROUP_FEATURE_BALLOT_BIT))
4437         TCU_THROW(NotSupportedError, "VK_SUBGROUP_FEATURE_BALLOT_BIT not supported");
4438 
4439     if (m_data.shaderStage == VK_SHADER_STAGE_COMPUTE_BIT)
4440     {
4441         if ((m_data.sizeX > limits.maxComputeWorkGroupSize[0]) || (m_data.sizeY > limits.maxComputeWorkGroupSize[1]) ||
4442             ((m_data.sizeX * m_data.sizeY) > limits.maxComputeWorkGroupInvocations))
4443         {
4444             TCU_THROW(NotSupportedError, "compute workgroup count exceeds device limit");
4445         }
4446     }
4447 
4448     if (!(subgroupProperties.supportedStages & m_data.shaderStage))
4449     {
4450         std::stringstream ss;
4451         ss << getShaderStageFlagsStr(m_data.shaderStage);
4452         ss << " does not support subgroup operations";
4453         ss.flush();
4454         TCU_THROW(NotSupportedError, ss.str());
4455     }
4456 
4457     // Both subgroup- AND workgroup-uniform tests are enabled by shaderSubgroupUniformControlFlow.
4458     if (m_data.isUCF() && !context.getShaderSubgroupUniformControlFlowFeatures().shaderSubgroupUniformControlFlow)
4459         TCU_THROW(NotSupportedError, "shaderSubgroupUniformControlFlow not supported");
4460 
4461     if (m_data.testType == TT_MAXIMAL && !context.getShaderMaximalReconvergenceFeatures().shaderMaximalReconvergence)
4462         TCU_THROW(NotSupportedError, "shaderMaximalReconvergence not supported");
4463 }
4464 
selectProgram() const4465 de::MovePtr<RandomProgram> ReconvergenceTestCase::selectProgram() const
4466 {
4467     RandomProgram *programPtr(nullptr);
4468     switch (m_data.shaderStage)
4469     {
4470     case VK_SHADER_STAGE_COMPUTE_BIT:
4471         programPtr = new ComputeRandomProgram(m_data);
4472         break;
4473     case VK_SHADER_STAGE_FRAGMENT_BIT:
4474         programPtr = new FragmentRandomProgram(m_data);
4475         break;
4476     case VK_SHADER_STAGE_VERTEX_BIT:
4477         programPtr = new VertexRandomProgram(m_data);
4478         break;
4479     case VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT:
4480         programPtr = new TessCtrlRandomProgram(m_data, 0);
4481         break;
4482     case VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT:
4483         programPtr = new TessEvalRandomProgram(m_data);
4484         break;
4485     case VK_SHADER_STAGE_GEOMETRY_BIT:
4486         programPtr = new GeometryRandomProgram(m_data);
4487         break;
4488     default:
4489         DE_ASSERT(0);
4490     }
4491     DE_ASSERT(programPtr);
4492     return de::MovePtr<RandomProgram>(programPtr);
4493 }
4494 
// GLSL source of a fragment shader that writes vec4(1.0) to output location 0.
std::string genPassThroughFragmentSource()
{
    return std::string("#version 450 core\n"
                       "layout(location = 0) out vec4 color;\n"
                       "void main() {\n"
                       "  color = vec4(1.0);\n"
                       "}\n");
}
4506 
// GLSL source of a vertex shader that forwards the xy of its input position
// with z = 0 and w = 1.
std::string genPassThroughVertexSource()
{
    return std::string("#version 450 core\n"
                       "layout(location = 0) in vec4 pos;\n"
                       "void main() {\n"
                       "   gl_Position = vec4(pos.xy, 0.0, 1.0);\n"
                       "}\n");
}
4518 
// GLSL source of a tessellation control shader that copies the input position
// of each of its 3 output vertices and sets every tessellation level to 1.0.
std::string genPassThroughTessCtrlSource()
{
    return std::string("#version 450 core\n"
                       "#extension GL_EXT_tessellation_shader : require\n"
                       "layout(vertices = 3) out;\n"
                       "void main() {\n"
                       "   gl_out[gl_InvocationID].gl_Position = gl_in[gl_InvocationID].gl_Position;\n"
                       "   gl_TessLevelOuter[0] = 1.0;\n"
                       "   gl_TessLevelOuter[1] = 1.0;\n"
                       "   gl_TessLevelOuter[2] = 1.0;\n"
                       "   gl_TessLevelOuter[3] = 1.0;\n"
                       "   gl_TessLevelInner[0] = 1.0;\n"
                       "   gl_TessLevelInner[1] = 1.0;\n"
                       "}\n");
}
4537 
// GLSL source of a tessellation evaluation shader for triangles that blends
// the three input positions with the tessellation coordinates.
std::string genPassThroughTessEvalSource()
{
    return std::string("#version 450 core\n"
                       "#extension GL_EXT_tessellation_shader : require\n"
                       "layout(equal_spacing, triangles) in;\n"
                       "void main() {\n"
                       "   float u = gl_TessCoord.x;\n"
                       "   float v = gl_TessCoord.y;\n"
                       "   float w = gl_TessCoord.z;\n"
                       "   vec4 p0 = vec4(gl_in[0].gl_Position.xy, 0.0, 1.0);\n"
                       "   vec4 p1 = vec4(gl_in[1].gl_Position.xy, 0.0, 1.0);\n"
                       "   vec4 p2 = vec4(gl_in[2].gl_Position.xy, 0.0, 1.0);\n"
                       "   gl_Position = u * p0 + v * p1 + w * p2;\n"
                       "}\n");
}
4556 
delayedInit(void)4557 void ReconvergenceTestCase::delayedInit(void)
4558 {
4559     m_program = std::shared_ptr<RandomProgram>(selectProgram().release());
4560 }
4561 
initPrograms(SourceCollections & programCollection) const4562 void ReconvergenceTestCase::initPrograms(SourceCollections &programCollection) const
4563 {
4564     de::MovePtr<RandomProgram> program = selectProgram();
4565 
4566     m_subgroupSizeToMaxLoc = program->generateRandomProgram(m_testCtx.getWatchDog(), m_testCtx.getLog());
4567 
4568     std::stringstream header, layout, globals, prologue, epilogue, aux;
4569 
4570     header << "#version 450 core\n";
4571     header << "#extension GL_KHR_shader_subgroup_ballot : enable\n";
4572     header << "#extension GL_KHR_shader_subgroup_vote : enable\n";
4573     header << "#extension GL_NV_shader_subgroup_partitioned : enable\n";
4574     header << "#extension GL_EXT_subgroup_uniform_control_flow : enable\n";
4575     if (m_data.testType == TT_MAXIMAL)
4576     {
4577         header << "#extension GL_EXT_maximal_reconvergence : require\n";
4578     }
4579     switch (m_data.shaderStage)
4580     {
4581     case VK_SHADER_STAGE_COMPUTE_BIT:
4582         layout << "layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z = 1) in;\n";
4583         layout << "layout(set=0, binding=2) coherent buffer OutputC { uint loc[]; } outputC;\n";
4584         layout << "layout(set=0, binding=1) coherent buffer OutputB { uvec4 b[]; } outputB;\n";
4585         layout << "layout(set=0, binding=0) coherent buffer InputA  { uint  a[]; } inputA;\n";
4586         break;
4587     case VK_SHADER_STAGE_FRAGMENT_BIT:
4588         layout << "// NOTE: A fragment can belong to more than one primitive, and the shader processes each\n";
4589         layout << "//       fragment primitive by primitive, so the number of invocation does not have to be\n";
4590         layout << "//       equal to the number of fragments of the rendering area. Another important thing\n";
4591         layout << "//       is that the Implementation is free to change the order of draving primitives\n";
4592         layout << "//       between subsequent application calls.\n";
4593 
4594         layout << "// inputA.a[ invocationStride ] = { 0, 1, ..., (invocationStride - 1) }\n";
4595         layout << "layout(set=0, binding=0) coherent buffer InputA  { uint  a[]; } inputA;\n";
4596 
4597         layout << "// outputB.b[ max(loc[]) * invocationStride * primitiveStride ]\n";
4598         layout << "layout(set=0, binding=1) coherent buffer OutputB { uvec4 b[]; } outputB;\n";
4599 
4600         layout << "// outputC.c[invocationStride * primitiveStride ], incremented per primitive\n";
4601         layout << "layout(set=0, binding=2) coherent buffer OutputC { uint  loc[]; } outputC;\n";
4602 
4603         layout << "// outputP.p[ width * height * primitiveStride + 1 ], one more for calculating subgroupID\n";
4604         layout << "layout(set=0, binding=3) coherent buffer OutputP { uint  p[]; } outputP;\n";
4605 
4606         layout << "layout(location = 0) out vec4 dEQP_FragColor;\n";
4607         break;
4608     case VK_SHADER_STAGE_VERTEX_BIT:
4609         layout << "layout(location = 0) in vec4 pos;\n";
4610         layout << "layout(set=0, binding=3) coherent buffer OutputP { uint  p[]; } outputP;\n";
4611         layout << "layout(set=0, binding=2) coherent buffer OutputC { uint loc[]; } outputC;\n";
4612         layout << "layout(set=0, binding=1) coherent buffer OutputB { uvec4 b[]; } outputB;\n";
4613         layout << "layout(set=0, binding=0) coherent buffer InputA  { uint  a[]; } inputA;\n";
4614         break;
4615     case VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT:
4616         layout << "#extension GL_EXT_tessellation_shader : require\n";
4617         layout << "layout(vertices = " << TessCtrlRandomProgram::minSubgroupSize << ") out;\n";
4618         layout << "layout(set=0, binding=2) coherent buffer OutputC { uint loc[]; } outputC;\n";
4619         layout << "layout(set=0, binding=1) coherent buffer OutputB { uvec2 b[]; } outputB;\n";
4620         layout << "layout(set=0, binding=0) coherent buffer InputA  { uint  a[]; } inputA;\n";
4621         break;
4622     case VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT:
4623         layout << "#extension GL_EXT_tessellation_shader : require\n";
4624         layout << "layout(equal_spacing, quads) in;\n";
4625         layout << "layout(set=0, binding=2) coherent buffer OutputC { uint loc[]; } outputC;\n";
4626         layout << "layout(set=0, binding=1) coherent buffer OutputB { uvec2 b[]; } outputB;\n";
4627         layout << "layout(set=0, binding=0) coherent buffer InputA  { uint  a[]; } inputA;\n";
4628         break;
4629     case VK_SHADER_STAGE_GEOMETRY_BIT:
4630         layout << "#extension GL_EXT_geometry_shader : require\n";
4631         layout << "layout(points) in;\n";
4632         layout << "layout(points, max_vertices = 1) out;\n";
4633         layout << "layout(set=0, binding=3) coherent buffer OutputP { uint  p[]; } outputP;\n";
4634         layout << "layout(set=0, binding=2) coherent buffer OutputC { uint loc[]; } outputC;\n";
4635         layout << "layout(set=0, binding=1) coherent buffer OutputB { uvec4 b[]; } outputB;\n";
4636         layout << "layout(set=0, binding=0) coherent buffer InputA  { uint  a[]; } inputA;\n";
4637         break;
4638     default:
4639         DE_ASSERT(0);
4640     }
4641 
4642     std::stringstream pushConstantLayout;
4643     pushConstantLayout
4644         << "layout(push_constant) uniform PC {\n"
4645            "   // set to the real stride when writing out ballots, or zero when just counting\n"
4646            "   int  invocationStride;\n"
4647            "   // wildcard fields, for an example the dimensions of rendered area in the case of graphics shaders\n"
4648            "   int  width;\n"
4649            "   int  height;\n"
4650            "   uint primitiveStride;\n"
4651            "   uint subgroupStride;\n"
4652            "   uint enableInvocationIndex;\n"
4653            "};\n";
4654     pushConstantLayout.flush();
4655     layout << pushConstantLayout.str();
4656 
4657     globals << "int outLoc = 0;\n";
4658     globals << "bool testBit(uvec4 mask, uint bit) { return ((mask[bit / 32] >> (bit % 32)) & 1) != 0; }\n";
4659     globals << "uint elect() { return int(subgroupElect()) + 1; }\n";
4660     if (m_data.shaderStage == VK_SHADER_STAGE_FRAGMENT_BIT)
4661     {
4662         static const std::string helperRoutinesCode(R"glsl(
4663         void setBit(uint bit, in out uvec4 ballot) {
4664             uint c = bit / 32;
4665             switch (c) {
4666                 case 0: ballot.x |= (1u << (bit % 32)); break;
4667                 case 1: ballot.y |= (1u << (bit % 32)); break;
4668                 case 2: ballot.z |= (1u << (bit % 32)); break;
4669                 case 3: ballot.w |= (1u << (bit % 32)); break;
4670             }
4671         }
4672         void resetBit(uint bit, in out uvec4 ballot) {
4673             uint c = bit / 32;
4674             uint mask = 0xFFFFFFFF ^ (1u << (bit % 32));
4675             switch (c) {
4676                 case 0: ballot.x &= mask; break;
4677                 case 1: ballot.y &= mask; break;
4678                 case 2: ballot.z &= mask; break;
4679                 case 3: ballot.w &= mask; break;
4680             }
4681         }
4682         uint fragmentIndex() { return (uint(gl_FragCoord.y) * width + uint(gl_FragCoord.x)); }
4683         uint invocationIndex() { return subgroupID * gl_SubgroupSize + gl_SubgroupInvocationID; }
4684         uvec4 invocationElectBallot() {
4685             uvec4 ballot = uvec4(0);
4686             ballot[gl_SubgroupInvocationID / 32] = (1 << (gl_SubgroupInvocationID % 32));
4687             return ballot;
4688         }
4689         uint next(uint hint) {
4690             return gl_HelperInvocation
4691                 ? (hint * enableInvocationIndex)
4692                 : outputC.loc[(gl_PrimitiveID * (subgroupStride * 128) + invocationIndex()) * enableInvocationIndex]++;
4693         }
4694         uint index(uint hint) {
4695             return ((
4696                 next(hint) * (subgroupStride * 128 * primitiveStride)
4697                 + (gl_PrimitiveID * subgroupStride * 128) + invocationIndex()) * enableInvocationIndex);
4698         }
4699         void storeValue(uint hintIndex, uvec4 value)
4700         {
4701             if (gl_HelperInvocation) {
4702                 if (hintIndex < BALLOT_STACK_SIZE)
4703                     ballotStack[hintIndex] = value;
4704             }
4705             else {
4706                 outputB.b[index(hintIndex)] = value;
4707             }
4708         }
4709         void storeValue(uint hintIndex, uint value) { storeValue(hintIndex, uvec4(value, 0, 0, 0)); }
4710         void storeBallot(uint hintIndex) { storeValue(hintIndex, subgroupBallot(true)); }
4711         )glsl");
4712 
4713         static const std::string prologueCode(R"glsl(
4714         uint helperInvocationCount = 0u;
4715         uint nonHelperInvocationCount = 0u;
4716         uvec4 helperInvocationsBits = uvec4(0, 0, 0, 0);
4717         uvec4 nonHelperInvocationsBits = uvec4(0, 0, 0, 0);
4718         if (gl_HelperInvocation)
4719         {
4720             helperInvocationsBits = subgroupBallot(true);
4721             helperInvocationCount = 1u;
4722         }
4723         else
4724         {
4725             nonHelperInvocationsBits = subgroupBallot(true);
4726             nonHelperInvocationCount = 1u;
4727         }
4728 
4729         helperInvocationsBits = subgroupOr(helperInvocationsBits);
4730         nonHelperInvocationsBits = subgroupOr(nonHelperInvocationsBits);
4731         uint helperBitCount = subgroupBallotBitCount(helperInvocationsBits);
4732         uint nonHelperBitCount = subgroupBallotBitCount(nonHelperInvocationsBits);
4733         helperInvocationCount = subgroupAdd(helperInvocationCount);
4734         nonHelperInvocationCount = subgroupAdd(nonHelperInvocationCount);
4735 
4736         const uint nonHelperElectBit = subgroupBallotFindLSB(nonHelperInvocationsBits);
4737         if (gl_SubgroupInvocationID == nonHelperElectBit)
4738         {
4739             subgroupID = atomicAdd(outputP.p[width * height * primitiveStride + 0], 1);
4740             outputP.p[width * height * primitiveStride + 1] = gl_SubgroupSize;
4741             atomicAdd(outputP.p[width * height * primitiveStride + 2], nonHelperInvocationCount);
4742             atomicAdd(outputP.p[width * height * primitiveStride + 3], helperInvocationCount);
4743         }
4744 
4745         subgroupID = subgroupShuffle(subgroupID, nonHelperElectBit);
4746 
4747         const uint localPrimitiveID = gl_PrimitiveID;
4748         const uint localFragmentID = fragmentIndex();
4749 
4750         if (!gl_HelperInvocation)
4751         {
4752             outputP.p[localFragmentID * primitiveStride + localPrimitiveID] =
4753                 ((subgroupID + 1) << 16) | gl_SubgroupInvocationID;
4754         }
4755 
4756         // Maping helper invocations block
4757         {
4758             uvec4 tmpHelperBits = helperInvocationsBits;
4759             uint helperSubgroupInvocationID = subgroupBallotFindLSB(tmpHelperBits);
4760             while (subgroupBallotBitExtract(tmpHelperBits, helperSubgroupInvocationID))
4761             {
4762                 uint helperSubgroupID = subgroupShuffle(subgroupID, helperSubgroupInvocationID);
4763                 uint helperFragmentID = subgroupShuffle(localFragmentID, helperSubgroupInvocationID);
4764                 uint helperPrimitiveID = subgroupShuffle(localPrimitiveID, helperSubgroupInvocationID);
4765                 if (gl_SubgroupInvocationID == nonHelperElectBit)
4766                 {
4767                     outputP.p[helperFragmentID * primitiveStride + helperPrimitiveID] =
4768                         (((helperSubgroupID + 1) | 0x8000) << 16) | helperSubgroupInvocationID;
4769                 }
4770                 resetBit(helperSubgroupInvocationID, tmpHelperBits);
4771                 helperSubgroupInvocationID = subgroupBallotFindLSB(tmpHelperBits);
4772             }
4773         }
4774         )glsl");
4775 
4776         static const std::string epilogueCode(R"glsl(
4777         // Save helper invocations entries block
4778         {
4779             uvec4 tmpHelperBits = subgroupOr(helperInvocationsBits);
4780             uint helperSubgroupInvocationID = subgroupBallotFindLSB(tmpHelperBits);
4781             while (helperSubgroupInvocationID < gl_SubgroupSize)
4782             {
4783                 const uint maxOutLoc = subgroupShuffle(outLoc, helperSubgroupInvocationID);
4784                 if (maxOutLoc == 0)
4785                 {
4786                     resetBit(helperSubgroupInvocationID, tmpHelperBits);
4787                     helperSubgroupInvocationID = subgroupBallotFindLSB(tmpHelperBits);
4788                     continue;
4789                 }
4790 
4791                 uvec4 helperBallotStack[BALLOT_STACK_SIZE];
4792                 uint helperSubgroupID = subgroupShuffle(subgroupID, helperSubgroupInvocationID);
4793                 uint helperFragmentID = subgroupShuffle(localFragmentID, helperSubgroupInvocationID);
4794                 uint helperPrimitiveID = subgroupShuffle(localPrimitiveID, helperSubgroupInvocationID);
4795                 for (uint i = 0; i < maxOutLoc && i < BALLOT_STACK_SIZE; i++) {
4796                     helperBallotStack[i] = subgroupShuffle(ballotStack[i], helperSubgroupInvocationID);
4797                 }
4798 
4799                 if (gl_SubgroupInvocationID == nonHelperElectBit)
4800                 {
4801                     uint helperInvocationIndex = helperSubgroupID * gl_SubgroupSize + helperSubgroupInvocationID;
4802                     uint helperPrimitiveInvocationIndex = helperInvocationIndex * primitiveStride + helperPrimitiveID;
4803 
4804                     outputC.loc[(helperInvocationIndex * primitiveStride + helperPrimitiveID) * enableInvocationIndex] = maxOutLoc;
4805 
4806                     for (uint j = 0; j < maxOutLoc; j++)
4807                     {
4808                         uint outputIndex = ((j * (subgroupStride * 128u * primitiveStride)
4809                             + (helperPrimitiveID * subgroupStride * 128u) + helperInvocationIndex) * enableInvocationIndex);
4810                         uvec4 outputValue = (j < BALLOT_STACK_SIZE) ? helperBallotStack[j] : uvec4(0,0,0,0);
4811                         outputB.b[outputIndex] = outputValue;
4812                     }
4813                 }
4814                 resetBit(helperSubgroupInvocationID, tmpHelperBits);
4815                 helperSubgroupInvocationID = subgroupBallotFindLSB(tmpHelperBits);
4816             } // wend
4817         }
4818 
4819         dEQP_FragColor = vec4(1.0);
4820         )glsl");
4821 
4822         header << "#extension GL_KHR_shader_subgroup_shuffle : enable\n";
4823         header << "#extension GL_KHR_shader_subgroup_arithmetic : enable\n";
4824         header << "#define BALLOT_STACK_SIZE " << FragmentRandomProgram::experimentalOutLocSize << '\n';
4825 
4826         {
4827             aux << header.str();
4828             aux << pushConstantLayout.str();
4829             aux << "uint outLoc = 0;\n";
4830             aux << "struct OutputC { uint loc[1]; };\n";
4831             aux << "struct OutputB { uvec4 b[1]; };\n";
4832             aux << "uint subgroupID = 11111;\n";
4833             aux << "uvec4 ballotStack[BALLOT_STACK_SIZE];\n";
4834             aux << "OutputC outputC;\n";
4835             aux << "OutputB outputB;\n";
4836             aux << "// OutputP.p[ width * height * primitiveStride + 4 ], few more for calculating subgroupID, "
4837                    "subgroupSize, non-helper and helper invocations\n";
4838             aux << "layout(set = 0, binding = 0) coherent buffer OutputP { uint p[]; } outputP;\n";
4839             aux << "layout(location = 0) out vec4 dEQP_FragColor;\n";
4840             aux << helperRoutinesCode;
4841             aux << "void main() {\n"
4842                 << prologueCode << epilogueCode << "   \n"
4843                 << "}\n";
4844         }
4845 
4846         globals << "uint subgroupID = 22222;\n";
4847         globals << "uvec4 ballotStack[BALLOT_STACK_SIZE];\n";
4848         globals << helperRoutinesCode;
4849 
4850         prologue << prologueCode;
4851         epilogue << epilogueCode;
4852     }
4853     else if (m_data.shaderStage == VK_SHADER_STAGE_VERTEX_BIT)
4854     {
4855         static const std::string helperRoutinesCode(R"glsl(
4856         uint invocationIndex() { return subgroupID * gl_SubgroupSize + gl_SubgroupInvocationID; }
4857         uvec4 invocationElectBallot() {
4858             uvec4 ballot = uvec4(0);
4859             ballot[gl_SubgroupInvocationID / 32] = (1 << (gl_SubgroupInvocationID % 32));
4860             return ballot;
4861         }
4862         void storeValue(uint loc, uvec4 value) {
4863             outputC.loc[gl_VertexIndex] = loc + 1u;
4864             outputB.b[(loc * invocationStride + gl_VertexIndex) * enableInvocationIndex] = value;
4865         }
4866         void storeValue(uint loc, uint value) { storeValue(loc, uvec4(value, 0, 0, 0)); }
4867         )glsl");
4868 
4869         static const std::string prologueCode(R"glsl(
4870         uint invocationCount = 1u;
4871         invocationCount = subgroupAdd(invocationCount);
4872 
4873         if (subgroupElect())
4874         {
4875             subgroupID = atomicAdd(outputP.p[NUM_SUBGROUPS_OFFSET], 1u);    // [+0]    subgroupID
4876             outputP.p[SUBGROUP_SIZE_OFFSET] = gl_SubgroupSize;                // [+1]    subgroupSize
4877             atomicAdd(outputP.p[INVOCATION_COUNT_OFFSET], invocationCount);    // [+2]    invocationCount
4878         }
4879         subgroupID = subgroupBroadcastFirst(subgroupID);
4880 
4881         outputP.p[gl_VertexIndex + INVOCATION_ENTRIES_OFFSET] = ((subgroupID + 1) << 16) | gl_SubgroupInvocationID;
4882         )glsl");
4883 
4884         static const std::string epilogueCode(R"glsl(
4885         gl_Position = vec4(pos.xy, 0.0, 1.0);
4886         gl_PointSize = 1.0;
4887         )glsl");
4888 
4889         header << "#extension GL_KHR_shader_subgroup_arithmetic : enable\n";
4890         header << "#define NUM_SUBGROUPS_OFFSET            0\n";
4891         header << "#define SUBGROUP_SIZE_OFFSET            1\n";
4892         header << "#define INVOCATION_COUNT_OFFSET        2\n";
4893         header << "#define INVOCATION_ENTRIES_OFFSET    3\n";
4894 
4895         globals << "uint subgroupID = 33333;\n";
4896         globals << helperRoutinesCode;
4897 
4898         prologue << prologueCode;
4899         epilogue << epilogueCode;
4900     }
4901     else if (m_data.shaderStage == VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT)
4902     {
4903         // push_constant::width holds the smallest subgroup size defined in TessCtrlRandomProgram::minSubgroupSize
4904         globals << "// push_constant::width is the smallest subgroup size which this shader is run on\n";
4905         globals << "uint invocationIndex() { return ((((gl_PrimitiveID * width) / gl_SubgroupSize) * gl_SubgroupSize) "
4906                    "+ gl_SubgroupInvocationID); }\n";
4907 
4908         epilogue
4909             << "   gl_out[gl_InvocationID].gl_Position = gl_in[gl_InvocationID % gl_PatchVerticesIn].gl_Position;\n";
4910         epilogue << "   gl_TessLevelOuter[0] = 1.0;\n";
4911         epilogue << "   gl_TessLevelOuter[1] = 1.0;\n";
4912         epilogue << "   gl_TessLevelOuter[2] = 1.0;\n";
4913         epilogue << "   gl_TessLevelOuter[3] = 1.0;\n";
4914         epilogue << "   gl_TessLevelInner[0] = 1.0;\n";
4915         epilogue << "   gl_TessLevelInner[1] = 1.0;\n";
4916     }
4917     else if (m_data.shaderStage == VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT)
4918     {
4919         globals << "// push_constant::width is an invocation count when processing a quad for a single patch\n";
4920         globals << "uint invocationIndex() { return ((((gl_PrimitiveID * width) / gl_SubgroupSize) * gl_SubgroupSize) "
4921                    "+ gl_SubgroupInvocationID); }\n";
4922 
4923         epilogue << "   float u = gl_TessCoord.x;\n";
4924         epilogue << "   float v = gl_TessCoord.y;\n";
4925         epilogue << "   float w = gl_TessCoord.z;\n";
4926         epilogue << "   vec4 p0 = vec4(gl_in[0].gl_Position.xy, 0.0, 1.0);\n";
4927         epilogue << "   vec4 p1 = vec4(gl_in[1].gl_Position.xy, 0.0, 1.0);\n";
4928         epilogue << "   vec4 p2 = vec4(gl_in[2].gl_Position.xy, 0.0, 1.0);\n";
4929         epilogue << "   gl_Position = u * p0 + v * p1 + w * p2;\n";
4930     }
4931     else if (m_data.shaderStage == VK_SHADER_STAGE_GEOMETRY_BIT)
4932     {
4933         static const std::string helperRoutinesCode(R"glsl(
4934         uint invocationIndex() { return subgroupID * gl_SubgroupSize + gl_SubgroupInvocationID; }
4935         void storeValue(uint loc, uvec4 value) {
4936             outputC.loc[gl_PrimitiveIDIn] = loc + 1u;
4937             outputB.b[(loc * invocationStride + gl_PrimitiveIDIn) * enableInvocationIndex] = value;
4938         }
4939         void storeValue(uint loc, uint value) { storeValue(loc, uvec4(value, 0, 0, 0)); }
4940         void storeBallot(uint loc) { storeValue(loc, subgroupBallot(true)); }
4941         uvec4 invocationElectBallot() {
4942             uvec4 ballot = uvec4(0);
4943             ballot[gl_SubgroupInvocationID / 32] = (1 << (gl_SubgroupInvocationID % 32));
4944             return ballot;
4945         }
4946         )glsl");
4947 
4948         static const std::string prologueCode(R"glsl(
4949         uint invocationCount = 1u;
4950         invocationCount = subgroupAdd(invocationCount);
4951         uint identity = gl_PrimitiveIDIn + 1u;
4952         uint maxIdentity = subgroupMax(identity);
4953 
4954         if (subgroupElect()) {
4955             subgroupID = atomicAdd(outputP.p[SUBGROUP_ID_OFFSET], 1u);            // [+0]    subgroupID
4956             outputP.p[SUBGROUP_SIZE_OFFSET] = gl_SubgroupSize;                    // [+1]    subgroupSize
4957             atomicAdd(outputP.p[INVOCATION_COUNT_OFFSET], invocationCount);        // [+2]    invocationCount
4958             atomicMax(outputP.p[MAX_IDENTITY_OFFSET], maxIdentity);
4959         }
4960         subgroupID = subgroupBroadcastFirst(subgroupID);
4961 
4962         outputP.p[gl_PrimitiveIDIn + INVOCATION_ENTRY_OFFSET] = ((subgroupID + 1) << 16) | gl_SubgroupInvocationID;
4963 
4964         )glsl");
4965 
4966         static const std::string epilogueCode(R"glsl(
4967         uint maxLoc = subgroupMax(outLoc);
4968         atomicMax(outputP.p[MAX_LOC_OFFSET], maxLoc);
4969 
4970         gl_Position = gl_in[gl_PrimitiveIDIn].gl_Position;
4971         gl_PrimitiveID = gl_PrimitiveIDIn;
4972 
4973         EmitVertex();
4974         EndPrimitive();
4975         )glsl");
4976 
4977         header << "#extension GL_KHR_shader_subgroup_arithmetic : enable\n";
4978         header << "#define SUBGROUP_ID_OFFSET       0\n";
4979         header << "#define SUBGROUP_SIZE_OFFSET     1\n";
4980         header << "#define INVOCATION_COUNT_OFFSET  2\n";
4981         header << "#define MAX_LOC_OFFSET           3\n";
4982         header << "#define MAX_IDENTITY_OFFSET      4\n";
4983         header << "#define INVOCATION_ENTRY_OFFSET  5\n";
4984 
4985         globals << "uint subgroupID;\n";
4986         globals << "uint numSubgroups;\n";
4987         globals << helperRoutinesCode;
4988 
4989         prologue << prologueCode;
4990         epilogue << epilogueCode;
4991     }
4992 
4993     std::stringstream css, functions, main;
4994     m_program->printCode(functions, main);
4995 
4996     css << header.str();
4997     css << layout.str();
4998     css << globals.str();
4999 
5000     css << functions.str() << "\n\n";
5001 
5002     css << "void main()\n"
5003         << (m_data.isSUCF() ? "[[subgroup_uniform_control_flow]]\n" : "")
5004         << (m_data.testType == TT_MAXIMAL ? "[[maximally_reconverges]]\n" : "") << "{\n";
5005 
5006     css << prologue.str() << "\n";
5007     css << main.str() << "\n\n";
5008     css << epilogue.str() << "\n";
5009 
5010     css << "}\n";
5011 
5012     const vk::ShaderBuildOptions buildOptions(programCollection.usedVulkanVersion, vk::SPIRV_VERSION_1_3, 0u);
5013 
5014     auto &testingShader = programCollection.glslSources.add("test");
5015     switch (m_data.shaderStage)
5016     {
5017     case VK_SHADER_STAGE_COMPUTE_BIT:
5018         testingShader << glu::ComputeSource(css.str()) << buildOptions;
5019         break;
5020     case VK_SHADER_STAGE_FRAGMENT_BIT:
5021         testingShader << glu::FragmentSource(css.str()) << buildOptions;
5022         programCollection.glslSources.add("vert") << glu::VertexSource(genPassThroughVertexSource()) << buildOptions;
5023         programCollection.glslSources.add("aux") << glu::FragmentSource(aux.str()) << buildOptions;
5024         break;
5025     case VK_SHADER_STAGE_VERTEX_BIT:
5026         testingShader << glu::VertexSource(css.str()) << buildOptions;
5027         programCollection.glslSources.add("frag") << glu::FragmentSource(genPassThroughFragmentSource());
5028         break;
5029     case VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT:
5030         testingShader << glu::TessellationControlSource(css.str()) << buildOptions;
5031         programCollection.glslSources.add("vert") << glu::VertexSource(genPassThroughVertexSource());
5032         programCollection.glslSources.add("frag") << glu::FragmentSource(genPassThroughFragmentSource());
5033         programCollection.glslSources.add("tese") << glu::TessellationEvaluationSource(genPassThroughTessEvalSource());
5034         break;
5035     case VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT:
5036         testingShader << glu::TessellationEvaluationSource(css.str()) << buildOptions;
5037         programCollection.glslSources.add("vert") << glu::VertexSource(genPassThroughVertexSource());
5038         programCollection.glslSources.add("frag") << glu::FragmentSource(genPassThroughFragmentSource());
5039         programCollection.glslSources.add("tesc") << glu::TessellationControlSource(genPassThroughTessCtrlSource());
5040         break;
5041     case VK_SHADER_STAGE_GEOMETRY_BIT:
5042         testingShader << glu::GeometrySource(css.str()) << buildOptions;
5043         programCollection.glslSources.add("vert") << glu::VertexSource(genPassThroughVertexSource());
5044         programCollection.glslSources.add("frag") << glu::FragmentSource(genPassThroughFragmentSource());
5045         break;
5046     default:
5047         DE_ASSERT(0);
5048     }
5049 }
5050 
createInstance(Context & context) const5051 TestInstance *ReconvergenceTestCase::createInstance(Context &context) const
5052 {
5053     switch (m_data.shaderStage)
5054     {
5055     case VK_SHADER_STAGE_COMPUTE_BIT:
5056         return new ReconvergenceTestComputeInstance(context, m_data, m_program, std::move(m_subgroupSizeToMaxLoc));
5057     case VK_SHADER_STAGE_FRAGMENT_BIT:
5058         return new ReconvergenceTestFragmentInstance(context, m_data);
5059     case VK_SHADER_STAGE_VERTEX_BIT:
5060         return new ReconvergenceTestVertexInstance(context, m_data);
5061     case VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT:
5062         return new ReconvergenceTestTessCtrlInstance(context, m_data);
5063     case VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT:
5064         return new ReconvergenceTestTessEvalInstance(context, m_data);
5065     case VK_SHADER_STAGE_GEOMETRY_BIT:
5066         return new ReconvergenceTestGeometryInstance(context, m_data);
5067     default:
5068         DE_ASSERT(false);
5069     }
5070     return nullptr;
5071 }
5072 
iterate(void)5073 tcu::TestStatus ReconvergenceTestComputeInstance::iterate(void)
5074 {
5075     const DeviceInterface &vk            = m_context.getDeviceInterface();
5076     const VkDevice device                = m_context.getDevice();
5077     Allocator &allocator                 = m_context.getDefaultAllocator();
5078     tcu::TestLog &log                    = m_context.getTestContext().getLog();
5079     const VkPhysicalDeviceLimits &limits = m_context.getDeviceProperties().limits;
5080 
5081     const uint32_t invocationStride = m_data.sizeX * m_data.sizeY;
5082 
5083     std::vector<tcu::UVec4> ref;
5084     add_ref<ComputeRandomProgram> program(*m_program);
5085 
5086     uint32_t precalculatedMaxLoc = 0u;
5087     if (auto itPrecalculatedMaxLoc = m_subgroupSizeToMaxLoc.find(m_subgroupSize);
5088         itPrecalculatedMaxLoc != m_subgroupSizeToMaxLoc.end())
5089     {
5090         precalculatedMaxLoc = itPrecalculatedMaxLoc->second;
5091     }
5092     uint32_t maxLoc       = precalculatedMaxLoc ? precalculatedMaxLoc :
5093                                                   program.execute(m_context.getTestContext().getWatchDog(), true,
5094                                                                   m_subgroupSize, 0u, invocationStride, ref, log);
5095     uint32_t shaderMaxLoc = maxLoc;
5096 
5097     // maxLoc is per-invocation. Add one (to make sure no additional writes are done) and multiply by
5098     // the number of invocations
5099     maxLoc++;
5100     maxLoc *= invocationStride;
5101 
5102     // buffer[0] is an input filled with a[i] == i
5103     // buffer[1] is the output
5104     // buffer[2] is the location counts
5105     de::MovePtr<BufferWithMemory> buffers[3];
5106     vk::VkDescriptorBufferInfo bufferDescriptors[3];
5107 
5108     VkDeviceSize sizes[3] = {
5109         invocationStride * sizeof(uint32_t),
5110         maxLoc * sizeof(tcu::UVec4),
5111         invocationStride * sizeof(uint32_t),
5112     };
5113 
5114     for (uint32_t i = 0; i < 3; ++i)
5115     {
5116         if (sizes[i] > limits.maxStorageBufferRange)
5117             TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
5118 
5119         try
5120         {
5121             buffers[i] = de::MovePtr<BufferWithMemory>(new BufferWithMemory(
5122                 vk, device, allocator,
5123                 makeBufferCreateInfo(sizes[i], VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
5124                                                    VK_BUFFER_USAGE_TRANSFER_SRC_BIT),
5125                 MemoryRequirement::HostVisible | MemoryRequirement::Cached));
5126         }
5127         catch (tcu::ResourceError &)
5128         {
5129             // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
5130             return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
5131                                    "Failed device memory allocation " + de::toString(sizes[i]) + " bytes");
5132         }
5133         bufferDescriptors[i] = makeDescriptorBufferInfo(**buffers[i], 0, sizes[i]);
5134     }
5135 
5136     void *ptrs[3];
5137     for (uint32_t i = 0; i < 3; ++i)
5138     {
5139         ptrs[i] = buffers[i]->getAllocation().getHostPtr();
5140     }
5141     for (uint32_t i = 0; i < sizes[0] / sizeof(uint32_t); ++i)
5142     {
5143         ((uint32_t *)ptrs[0])[i] = i;
5144     }
5145     deMemset(ptrs[1], 0, (size_t)sizes[1]);
5146     deMemset(ptrs[2], 0, (size_t)sizes[2]);
5147 
5148     vk::DescriptorSetLayoutBuilder layoutBuilder;
5149 
5150     layoutBuilder.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_data.shaderStage);
5151     layoutBuilder.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_data.shaderStage);
5152     layoutBuilder.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_data.shaderStage);
5153 
5154     vk::Unique<vk::VkDescriptorSetLayout> descriptorSetLayout(layoutBuilder.build(vk, device));
5155 
5156     vk::Unique<vk::VkDescriptorPool> descriptorPool(
5157         vk::DescriptorPoolBuilder()
5158             .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 3u)
5159             .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
5160     vk::Unique<vk::VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
5161 
5162     const VkPushConstantRange pushConstantRange = {
5163         (VkShaderStageFlags)m_data.shaderStage, // VkShaderStageFlags stageFlags;
5164         0u,                                     // uint32_t offset;
5165         sizeof(PushConstant)                    // uint32_t size;
5166     };
5167 
5168     const VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo = {
5169         VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, // sType
5170         nullptr,                                       // pNext
5171         (VkPipelineLayoutCreateFlags)0,
5172         1,                          // setLayoutCount
5173         &descriptorSetLayout.get(), // pSetLayouts
5174         1u,                         // pushConstantRangeCount
5175         &pushConstantRange,         // pPushConstantRanges
5176     };
5177 
5178     flushAlloc(vk, device, buffers[0]->getAllocation());
5179     flushAlloc(vk, device, buffers[1]->getAllocation());
5180     flushAlloc(vk, device, buffers[2]->getAllocation());
5181 
5182     const VkPipelineBindPoint bindPoint = VK_PIPELINE_BIND_POINT_COMPUTE;
5183     const Unique<VkShaderModule> shader(createShaderModule(vk, device, m_context.getBinaryCollection().get("test"), 0));
5184     Move<VkPipelineLayout> pipelineLayout = createPipelineLayout(vk, device, &pipelineLayoutCreateInfo, NULL);
5185     Move<VkPipeline> pipeline             = createComputePipeline(*pipelineLayout, *shader);
5186     const VkQueue queue                   = m_context.getUniversalQueue();
5187     Move<VkCommandPool> cmdPool     = createCommandPool(vk, device, vk::VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT,
5188                                                         m_context.getUniversalQueueFamilyIndex());
5189     Move<VkCommandBuffer> cmdBuffer = allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY);
5190 
5191     vk::DescriptorSetUpdateBuilder setUpdateBuilder;
5192     setUpdateBuilder.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(0),
5193                                  VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptors[0]);
5194     setUpdateBuilder.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(1),
5195                                  VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptors[1]);
5196     setUpdateBuilder.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(2),
5197                                  VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptors[2]);
5198     setUpdateBuilder.update(vk, device);
5199 
5200     PushConstant pc{/* pcinvocationStride is initialized with 0, the rest of fields as well */};
5201 
5202     // compute "maxLoc", the maximum number of locations written
5203     beginCommandBuffer(vk, *cmdBuffer, 0u);
5204     vk.cmdBindDescriptorSets(*cmdBuffer, bindPoint, *pipelineLayout, 0u, 1, &*descriptorSet, 0u, nullptr);
5205     vk.cmdBindPipeline(*cmdBuffer, bindPoint, *pipeline);
5206     vk.cmdPushConstants(*cmdBuffer, *pipelineLayout, m_data.shaderStage, 0, sizeof(pc), &pc);
5207     vk.cmdDispatch(*cmdBuffer, 1, 1, 1);
5208     endCommandBuffer(vk, *cmdBuffer);
5209 
5210     submitCommandsAndWait(vk, device, queue, cmdBuffer.get());
5211 
5212     invalidateAlloc(vk, device, buffers[1]->getAllocation());
5213     invalidateAlloc(vk, device, buffers[2]->getAllocation());
5214 
5215     // Take the max over all invocations. Add one (to make sure no additional writes are done) and multiply by
5216     // the number of invocations
5217     uint32_t newMaxLoc = 0;
5218     for (uint32_t id = 0; id < invocationStride; ++id)
5219         newMaxLoc = de::max(newMaxLoc, ((uint32_t *)ptrs[2])[id]);
5220     shaderMaxLoc = newMaxLoc;
5221     newMaxLoc++;
5222     newMaxLoc *= invocationStride;
5223 
5224     // If we need more space, reallocate buffers[1]
5225     if (newMaxLoc > maxLoc)
5226     {
5227         maxLoc   = newMaxLoc;
5228         sizes[1] = maxLoc * sizeof(tcu::UVec4);
5229 
5230         if (sizes[1] > limits.maxStorageBufferRange)
5231             TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
5232 
5233         try
5234         {
5235             buffers[1] = de::MovePtr<BufferWithMemory>(new BufferWithMemory(
5236                 vk, device, allocator,
5237                 makeBufferCreateInfo(sizes[1], VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
5238                                                    VK_BUFFER_USAGE_TRANSFER_SRC_BIT),
5239                 MemoryRequirement::HostVisible | MemoryRequirement::Cached));
5240         }
5241         catch (tcu::ResourceError &)
5242         {
5243             // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
5244             return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
5245                                    "Failed device memory allocation " + de::toString(sizes[1]) + " bytes");
5246         }
5247         bufferDescriptors[1] = makeDescriptorBufferInfo(**buffers[1], 0, sizes[1]);
5248         ptrs[1]              = buffers[1]->getAllocation().getHostPtr();
5249 
5250         vk::DescriptorSetUpdateBuilder setUpdateBuilder2;
5251         setUpdateBuilder2.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(1),
5252                                       VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptors[1]);
5253         setUpdateBuilder2.update(vk, device);
5254     }
5255 
5256     // Clear any writes to buffer[1] during the counting pass
5257     deMemset(ptrs[1], 0, (size_t)sizes[1]);
5258     flushAlloc(vk, device, buffers[1]->getAllocation());
5259     // Clear any writes to buffer[2] during the counting pass
5260     deMemset(ptrs[2], 0, (size_t)sizes[2]);
5261     flushAlloc(vk, device, buffers[2]->getAllocation());
5262 
5263     // change invocationStride value in shader
5264     pc.invocationStride = invocationStride;
5265 
5266     // run the actual shader
5267     beginCommandBuffer(vk, *cmdBuffer, 0u);
5268     vk.cmdBindDescriptorSets(*cmdBuffer, bindPoint, *pipelineLayout, 0u, 1, &*descriptorSet, 0u, nullptr);
5269     vk.cmdBindPipeline(*cmdBuffer, bindPoint, *pipeline);
5270     vk.cmdPushConstants(*cmdBuffer, *pipelineLayout, m_data.shaderStage, 0, sizeof(pc), &pc);
5271     vk.cmdDispatch(*cmdBuffer, 1, 1, 1);
5272     endCommandBuffer(vk, *cmdBuffer);
5273 
5274     submitCommandsAndWait(vk, device, queue, cmdBuffer.get());
5275 
5276     invalidateAlloc(vk, device, buffers[1]->getAllocation());
5277 
5278     // Simulate execution on the CPU, and compare against the GPU result
5279     try
5280     {
5281         ref.resize(maxLoc, tcu::UVec4());
5282     }
5283     catch (const std::bad_alloc &)
5284     {
5285         // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
5286         return tcu::TestStatus(QP_TEST_RESULT_NOT_SUPPORTED,
5287                                "Failed system memory allocation " + de::toString(maxLoc * sizeof(uint64_t)) + " bytes");
5288     }
5289 
5290     program.execute(m_context.getTestContext().getWatchDog(), false, m_subgroupSize, 0u, invocationStride, ref, log);
5291 
5292     const tcu::UVec4 *result = (const tcu::UVec4 *)ptrs[1];
5293 
5294     qpTestResult res = calculateAndLogResult(result, ref, invocationStride, m_subgroupSize, shaderMaxLoc);
5295 
5296     return tcu::TestStatus(res, qpGetTestResultName(res));
5297 }
5298 
calculateAndLogResult(const tcu::UVec4 * result,const std::vector<tcu::UVec4> & ref,uint32_t invocationStride,uint32_t subgroupSize,uint32_t shaderMaxLoc)5299 qpTestResult_e ReconvergenceTestComputeInstance::calculateAndLogResult(const tcu::UVec4 *result,
5300                                                                        const std::vector<tcu::UVec4> &ref,
5301                                                                        uint32_t invocationStride, uint32_t subgroupSize,
5302                                                                        uint32_t shaderMaxLoc)
5303 {
5304     const uint32_t maxLoc = static_cast<uint32_t>(ref.size());
5305     tcu::TestLog &log     = m_context.getTestContext().getLog();
5306     qpTestResult res      = QP_TEST_RESULT_PASS;
5307     DE_ASSERT(subgroupSize * shaderMaxLoc <= maxLoc);
5308     DE_UNREF(shaderMaxLoc);
5309 
5310     uint32_t mismatchCount            = 0u;
5311     const uint32_t printMismatchCount = 5u;
5312     if (m_data.testType == TT_MAXIMAL)
5313     {
5314         // With maximal reconvergence, we should expect the output to exactly match
5315         // the reference.
5316         for (uint32_t i = 0; i < maxLoc; ++i)
5317         {
5318             const Ballot resultVal(result[i], subgroupSize);
5319             const Ballot refVal(ref[i], subgroupSize);
5320             if (resultVal != refVal)
5321             {
5322                 res = QP_TEST_RESULT_FAIL;
5323                 if (mismatchCount++ < printMismatchCount)
5324                 {
5325                     log << tcu::TestLog::Message << "Mismatch at " << i << "\nexpected: " << resultVal
5326                         << "\n     got: " << refVal << tcu::TestLog::EndMessage;
5327                 }
5328                 else
5329                     break;
5330             }
5331         }
5332 
5333 #if 0 // This log can be large and slow, ifdef it out by default
5334         log << tcu::TestLog::Message << "subgroupSize:" << subgroupSize << ", invocationStride:" << invocationStride << ", maxLoc:" << shaderMaxLoc << tcu::TestLog::EndMessage;
5335         uint32_t invMax = std::min(invocationStride, 50u);
5336         for (uint32_t inv = 0; inv < invMax; ++inv)
5337         {
5338             auto ll = log << tcu::TestLog::Message;
5339             ll << inv << ": ";
5340             for (uint32_t loc = 0; loc < shaderMaxLoc; ++loc)
5341             {
5342                 uint64_t entry = result[loc * invocationStride + inv];
5343                 ll << de::toString(loc) << ":" << tcu::toHex(entry) << ' ';
5344             }
5345             ll << tcu::TestLog::EndMessage;
5346         }
5347 #endif
5348 
5349         if (res != QP_TEST_RESULT_PASS)
5350         {
5351             for (uint32_t i = 0; i < maxLoc; ++i)
5352             {
5353 #if 0
5354                 // This log can be large and slow, ifdef it out by default
5355                 const Ballot resultVal(result[i], subgroupSize);
5356                 const Ballot refVal(ref[i], subgroupSize);
5357                 log << tcu::TestLog::Message << "result " << i << "(" << (i / invocationStride) << ", " << (i % invocationStride) << "): " << resultVal << " ref " << refVal << (resultVal != refVal ? " different" : "") << tcu::TestLog::EndMessage;
5358 #endif
5359             }
5360         }
5361     }
5362     else
5363     {
5364         DE_ASSERT(subgroupSize != 0);
5365 
5366         Ballot fullMask = subgroupSizeToMask(subgroupSize, 0 /* ignored */);
5367         // For subgroup_uniform_control_flow, we expect any fully converged outputs in the reference
5368         // to have a corresponding fully converged output in the result. So walk through each lane's
5369         // results, and for each reference value of fullMask, find a corresponding result value of
5370         // fullMask where the previous value (OP_STORE) matches. That means these came from the same
5371         // source location.
5372         vector<uint32_t> firstFail(invocationStride, 0);
5373         for (uint32_t lane = 0; lane < invocationStride; ++lane)
5374         {
5375             uint32_t resLoc = lane + invocationStride, refLoc = lane + invocationStride;
5376             while (refLoc < maxLoc)
5377             {
5378                 while (refLoc < maxLoc && ref[refLoc] != fullMask)
5379                     refLoc += invocationStride;
5380                 if (refLoc >= maxLoc)
5381                     break;
5382 
5383                 // For TT_SUCF_ELECT, when the reference result has a full mask, we expect lane 0 to be elected
5384                 // (a value of 2) and all other lanes to be not elected (a value of 1). For TT_SUCF_BALLOT, we
5385                 // expect a full mask. Search until we find the expected result with a matching store value in
5386                 // the previous result.
5387                 Ballot expectedResult = m_data.isElect() ? Ballot((lane % m_subgroupSize) == 0 ? 2 : 1) : fullMask;
5388 
5389                 while (resLoc < maxLoc && !(result[resLoc] == expectedResult &&
5390                                             result[resLoc - invocationStride] == ref[refLoc - invocationStride]))
5391                     resLoc += invocationStride;
5392 
5393                 // If we didn't find this output in the result, flag it as an error.
5394                 if (resLoc >= maxLoc)
5395                 {
5396                     firstFail[lane] = refLoc;
5397                     log << tcu::TestLog::Message << "lane " << lane << " first mismatch at " << firstFail[lane]
5398                         << tcu::TestLog::EndMessage;
5399                     res = QP_TEST_RESULT_FAIL;
5400                     break;
5401                 }
5402                 refLoc += invocationStride;
5403                 resLoc += invocationStride;
5404             }
5405         }
5406 
5407         if (res != QP_TEST_RESULT_PASS)
5408         {
5409             for (uint32_t i = 0; i < maxLoc; ++i)
5410             {
5411                 // This log can be large and slow, ifdef it out by default
5412 #if 0
5413                 log << tcu::TestLog::Message << "result " << i << "(" << (i / invocationStride) << ", " << (i % invocationStride) << "): " << tcu::toHex(result[i]) << " ref " << tcu::toHex(ref[i]) << (i == firstFail[i % invocationStride] ? " first fail" : "") << tcu::TestLog::EndMessage;
5414 #endif
5415             }
5416         }
5417     }
5418 
5419     return res;
5420 }
5421 
makeRenderPassBeginInfo(const VkRenderPass renderPass,const VkFramebuffer framebuffer)5422 VkRenderPassBeginInfo ReconvergenceTestGraphicsInstance::makeRenderPassBeginInfo(const VkRenderPass renderPass,
5423                                                                                  const VkFramebuffer framebuffer)
5424 {
5425     static const VkClearValue clearValue{{{0u, 0u, 0u, 0u}}};
5426     return {
5427         VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO, // VkStructureType sType;
5428         nullptr,                                  // const void* pNext;
5429         renderPass,                               // VkRenderPass renderPass;
5430         framebuffer,                              // VkFramebuffer framebuffer;
5431         makeRect2D(m_data.sizeX, m_data.sizeY),   // VkRect2D renderArea;
5432         1u,                                       // uint32_t clearValueCount;
5433         &clearValue                               // const VkClearValue* pClearValues;
5434     };
5435 }
5436 
createVertexBufferAndFlush(uint32_t cellsHorz,uint32_t cellsVert,VkPrimitiveTopology topology)5437 de::MovePtr<BufferWithMemory> ReconvergenceTestGraphicsInstance::createVertexBufferAndFlush(
5438     uint32_t cellsHorz, uint32_t cellsVert, VkPrimitiveTopology topology)
5439 {
5440     uint32_t vertexCount   = cellsHorz * cellsVert;
5441     uint32_t triangleCount = cellsHorz * cellsVert;
5442     switch (topology)
5443     {
5444     case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
5445         vertexCount = triangleCount * 3;
5446         break;
5447     case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
5448         vertexCount = triangleCount - 1 + 3;
5449         break;
5450     case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST:
5451     case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
5452         triangleCount = vertexCount - 3 + 1;
5453         break;
5454     default:
5455         DE_ASSERT(0);
5456     }
5457 
5458     const DeviceInterface &vk            = m_context.getDeviceInterface();
5459     const VkDevice device                = m_context.getDevice();
5460     Allocator &allocator                 = m_context.getDefaultAllocator();
5461     const VkDeviceSize bufferSize        = VkDeviceSize(vertexCount) * sizeof(Vertex);
5462     const VkBufferUsageFlags bufferUsage = VK_BUFFER_USAGE_VERTEX_BUFFER_BIT;
5463     const VkBufferCreateInfo createInfo  = makeBufferCreateInfo(bufferSize, bufferUsage);
5464     const MemoryRequirement memoryReqs   = (MemoryRequirement::HostVisible | MemoryRequirement::Coherent);
5465     de::MovePtr<BufferWithMemory> buffer(new BufferWithMemory(vk, device, allocator, createInfo, memoryReqs));
5466     Allocation &allocation = buffer->getAllocation();
5467     Vertex *vertices       = static_cast<Vertex *>(allocation.getHostPtr());
5468 
5469     if (VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST == topology)
5470     {
5471         const float stepX = 2.0f / float(cellsHorz);
5472         const float stepY = 2.0f / float(cellsVert);
5473 
5474         uint32_t t = 0;
5475         float y    = -1.0f;
5476         for (uint32_t h = 0; h < cellsVert; ++h)
5477         {
5478             float x        = -1.0f;
5479             const float yy = y + stepY;
5480             for (uint32_t w = 0; w < cellsHorz; ++w)
5481             {
5482                 const float xx = x + stepX;
5483 
5484                 vertices[t++] = {x, yy, 0.f, 0.f};
5485                 vertices[t++] = {((xx + x) / 2.f), y, 0.f, 0.f};
5486                 vertices[t++] = {xx, ((yy + y) / 2.f), 0.f, 0.f};
5487 
5488                 x = xx;
5489             }
5490             y = yy;
5491         }
5492         DE_ASSERT(vertexCount == t);
5493     }
5494     else
5495     {
5496         const uint32_t div = static_cast<uint32_t>(ROUNDUP(triangleCount, 2) / 2);
5497         const float step   = 2.0f / static_cast<float>(div);
5498 
5499         uint32_t t = 0;
5500         float x    = -1.0f;
5501         for (uint32_t i = 0; i < div; ++i)
5502         {
5503             const bool last   = ((div - i) == 1u);
5504             const float xNext = last ? +1.0f : (x + step);
5505 
5506             const Vertex v0{x, +1.0f, 0.0f, 0.0f};
5507             const Vertex v1{xNext, +1.0f, 0.0f, 0.0f};
5508             const Vertex v2{xNext, -1.0f, 0.0f, 0.0f};
5509             const Vertex v3{x, -1.0f, 0.0f, 0.0f};
5510 
5511             if (t == 0)
5512             {
5513                 vertices[0] = v0;
5514                 vertices[1] = v3;
5515                 vertices[2] = v1;
5516 
5517                 t = 3;
5518             }
5519             else
5520             {
5521                 vertices[t++] = v1;
5522             }
5523 
5524             if (!last || !(triangleCount % 2))
5525             {
5526                 vertices[t++] = v2;
5527             }
5528 
5529             x += step;
5530         }
5531         DE_ASSERT(vertexCount == t);
5532     }
5533 
5534     flushAlloc(vk, device, allocation);
5535     return buffer;
5536 }
generateVertices(const uint32_t primitiveCount,const VkPrimitiveTopology topology,const uint32_t patchSize)5537 std::vector<tcu::Vec4> ReconvergenceTestGraphicsInstance::generateVertices(const uint32_t primitiveCount,
5538                                                                            const VkPrimitiveTopology topology,
5539                                                                            const uint32_t patchSize)
5540 {
5541     auto cast     = [](const float f) -> float { return ((f * 2.0f) - 1.0f); };
5542     auto bestRect = [](const uint32_t c) -> std::pair<uint32_t, uint32_t>
5543     {
5544         uint32_t a = 1;
5545         uint32_t b = 1;
5546         do
5547         {
5548             a = a + 1;
5549             b = (c / a) + ((c % a) ? 1 : 0);
5550         } while (a < b);
5551         return {a, b};
5552     };
5553 
5554     uint32_t triangleCount = 0;
5555     uint32_t vertexCount   = 0;
5556     switch (topology)
5557     {
5558     case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
5559         triangleCount = primitiveCount;
5560         vertexCount   = triangleCount + 3 - 1;
5561         break;
5562     case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
5563         triangleCount = primitiveCount;
5564         vertexCount   = triangleCount * 3;
5565         break;
5566     case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
5567         vertexCount = primitiveCount;
5568         break;
5569     case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST:
5570         vertexCount   = primitiveCount * patchSize;
5571         triangleCount = ROUNDUP(vertexCount, 3) / 3;
5572         break;
5573     default:
5574         DE_ASSERT(false);
5575     }
5576 
5577     if (3 == vertexCount)
5578     {
5579         return {{-1.0f, +1.0f, 0.0f, 1.0f}, {0.0f, -1.0f, 0.0f, 1.0f}, {+1.0f, +1.0f, 0.0f, 1.0f}};
5580     }
5581 
5582     std::vector<tcu::Vec4> vertices(vertexCount);
5583 
5584     if (VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP == topology)
5585     {
5586         uint32_t v         = 0;
5587         const uint32_t div = ROUNDUP(triangleCount, 2) / 2;
5588 
5589         for (uint32_t i = 0; i < triangleCount && v < vertexCount; ++i)
5590         {
5591             const float xx = cast(float((i / 2) + 1) / float(div));
5592             if (0 == i)
5593             {
5594                 const float x = cast(float(i / 2) / float(div));
5595                 vertices[v++] = {x, +1.0f, 0.0f, 1.0f};
5596                 vertices[v++] = {x, -1.0f, 0.0f, 1.0f};
5597                 vertices[v++] = {xx, +1.0f, 0.0f, 1.0f};
5598             }
5599             else
5600             {
5601                 if (i % 2)
5602                     vertices[v++] = {xx, -1.0f, 0.0f, 1.0f};
5603                 else
5604                     vertices[v++] = {xx, +1.0f, 0.0f, 1.0f};
5605             }
5606         }
5607         DE_ASSERT(vertexCount == v);
5608     }
5609     else if (VK_PRIMITIVE_TOPOLOGY_POINT_LIST == topology)
5610     {
5611         uint32_t v      = 0;
5612         const auto rect = bestRect(vertexCount);
5613 
5614         float y = -1.0f;
5615         for (uint32_t h = 0; h < rect.second; ++h)
5616         {
5617             const float yy = cast(float(h + 1) / float(rect.second));
5618             float x        = -1.0f;
5619             for (uint32_t w = 0; w < rect.first && v < vertexCount; ++w)
5620             {
5621                 const float xx = cast(float(w + 1) / float(rect.first));
5622                 vertices[v++]  = {((xx - x) / 2.0f), ((yy - y) / 2.0f), 0.0f, 1.0f};
5623                 x              = xx;
5624             }
5625             y = yy;
5626         }
5627         DE_ASSERT(vertexCount == v);
5628     }
5629     else if (VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST == topology || VK_PRIMITIVE_TOPOLOGY_PATCH_LIST == topology)
5630     {
5631         uint32_t v      = 0;
5632         const auto rect = bestRect(triangleCount);
5633 
5634         float y = -1.0f;
5635         for (uint32_t h = 0; h < rect.second && v < vertexCount; ++h)
5636         {
5637             const float yy = cast(float(h + 1) / float(rect.second));
5638             float x        = -1.0f;
5639             for (uint32_t w = 0; w < rect.first && v < vertexCount; ++w)
5640             {
5641                 const float xx = cast(float(w + 1) / float(rect.first));
5642                 if (v < vertexCount)
5643                     vertices[v++] = {x, yy, 0.f, 0.f};
5644                 if (v < vertexCount)
5645                     vertices[v++] = {((xx + x) / 2.f), y, 0.f, 0.f};
5646                 if (v < vertexCount)
5647                     vertices[v++] = {xx, ((yy + y) / 2.f), 0.f, 0.f};
5648                 x = xx;
5649             }
5650             y = yy;
5651         }
5652         DE_ASSERT(vertexCount == v);
5653     }
5654 
5655     return vertices;
5656 }
5657 
createVertexBufferAndFlush(const std::vector<tcu::Vec4> & vertices)5658 de::MovePtr<BufferWithMemory> ReconvergenceTestGraphicsInstance::createVertexBufferAndFlush(
5659     const std::vector<tcu::Vec4> &vertices)
5660 {
5661     const DeviceInterface &vk            = m_context.getDeviceInterface();
5662     const VkDevice device                = m_context.getDevice();
5663     Allocator &allocator                 = m_context.getDefaultAllocator();
5664     const VkDeviceSize bufferSize        = VkDeviceSize(vertices.size()) * sizeof(tcu::Vec4);
5665     const VkBufferUsageFlags bufferUsage = VK_BUFFER_USAGE_VERTEX_BUFFER_BIT;
5666     const VkBufferCreateInfo createInfo  = makeBufferCreateInfo(bufferSize, bufferUsage);
5667     const MemoryRequirement memoryReqs   = (MemoryRequirement::HostVisible | MemoryRequirement::Coherent);
5668     de::MovePtr<BufferWithMemory> buffer(new BufferWithMemory(vk, device, allocator, createInfo, memoryReqs));
5669     Allocation &allocation = buffer->getAllocation();
5670     auto bufferRange       = makeStdBeginEnd<tcu::Vec4>(allocation.getHostPtr(), (uint32_t)vertices.size());
5671     std::copy(vertices.begin(), vertices.end(), bufferRange.first);
5672     flushAlloc(vk, device, allocation);
5673     return buffer;
5674 }
5675 
recordDrawingAndSubmit(const VkCommandBuffer cmdBuffer,const VkPipelineLayout pipelineLayout,const VkPipeline pipeline,const VkDescriptorSet descriptorSet,const PushConstant & pushConstant,const VkRenderPassBeginInfo & renderPassInfo,const VkBuffer vertexBuffer,const uint32_t vertexCount,const VkImage image)5676 void ReconvergenceTestGraphicsInstance::recordDrawingAndSubmit(
5677     const VkCommandBuffer cmdBuffer, const VkPipelineLayout pipelineLayout, const VkPipeline pipeline,
5678     const VkDescriptorSet descriptorSet, const PushConstant &pushConstant, const VkRenderPassBeginInfo &renderPassInfo,
5679     const VkBuffer vertexBuffer, const uint32_t vertexCount, const VkImage image)
5680 {
5681     DE_UNREF(image);
5682     const DeviceInterface &vk           = m_context.getDeviceInterface();
5683     const VkDevice device               = m_context.getDevice();
5684     const VkQueue queue                 = m_context.getUniversalQueue();
5685     const VkPipelineBindPoint bindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS;
5686 
5687     beginCommandBuffer(vk, cmdBuffer, 0u);
5688     vk.cmdBindDescriptorSets(cmdBuffer, bindPoint, pipelineLayout, 0u, 1u, &descriptorSet, 0u, nullptr);
5689     vk.cmdBindPipeline(cmdBuffer, bindPoint, pipeline);
5690     vk.cmdBindVertexBuffers(cmdBuffer, 0u, 1u, &static_cast<const VkBuffer &>(vertexBuffer),
5691                             &static_cast<const VkDeviceSize &>(0u));
5692     vk.cmdPushConstants(cmdBuffer, pipelineLayout, m_data.shaderStage, 0, sizeof(PushConstant), &pushConstant);
5693     vk.cmdBeginRenderPass(cmdBuffer, &renderPassInfo, VK_SUBPASS_CONTENTS_INLINE);
5694     vk.cmdDraw(cmdBuffer, vertexCount, 1u, 0u, 0u);
5695     vk.cmdEndRenderPass(cmdBuffer);
5696     endCommandBuffer(vk, cmdBuffer);
5697 
5698     submitCommandsAndWait(vk, device, queue, cmdBuffer);
5699 }
5700 
createShaders(void)5701 std::vector<Move<VkShaderModule>> ReconvergenceTestFragmentInstance::createShaders(void)
5702 {
5703     const DeviceInterface &vk = m_context.getDeviceInterface();
5704     const VkDevice device     = m_context.getDevice();
5705 
5706     Move<VkShaderModule> vertex   = createShaderModule(vk, device, m_context.getBinaryCollection().get("vert"), 0);
5707     Move<VkShaderModule> fragment = createShaderModule(vk, device, m_context.getBinaryCollection().get("test"), 0);
5708 
5709     // { #vert, #frag, tesc, tese, geom }; if any
5710     std::vector<Move<VkShaderModule>> shaders;
5711     shaders.emplace_back(vertex);
5712     shaders.emplace_back(fragment);
5713 
5714     return shaders;
5715 }
5716 
calculateAndLogResult(const uint64_t * result,const std::vector<uint64_t> & ref,uint32_t invocationStride,uint32_t subgroupSize,uint32_t shaderMaxLocs,uint32_t primitiveCount,PrintMode printMode)5717 qpTestResult_e ReconvergenceTestGraphicsInstance::calculateAndLogResult(const uint64_t *result,
5718                                                                         const std::vector<uint64_t> &ref,
5719                                                                         uint32_t invocationStride,
5720                                                                         uint32_t subgroupSize, uint32_t shaderMaxLocs,
5721                                                                         uint32_t primitiveCount, PrintMode printMode)
5722 {
5723     DE_ASSERT(m_data.testType == TT_MAXIMAL);
5724 
5725     const uint32_t maxLoc  = static_cast<uint32_t>(ref.size());
5726     tcu::TestLog &log      = m_context.getTestContext().getLog();
5727     qpTestResult res       = QP_TEST_RESULT_PASS;
5728     uint32_t mismatchCount = 0;
5729 
5730     DE_ASSERT(shaderMaxLocs * invocationStride <= maxLoc);
5731 
5732     // With maximal reconvergence, we should expect the output to exactly match
5733     // the reference.
5734     for (uint32_t i = 0; i < maxLoc; ++i)
5735     {
5736         const uint64_t resultVal = result[i];
5737         const uint64_t refVal    = ref[i];
5738         if (resultVal != refVal)
5739         {
5740             if (1 > mismatchCount++)
5741             {
5742                 log << tcu::TestLog::Message << mismatchCount << ": Mismatch at " << i
5743                     << ", res: " << tcu::toHex(resultVal) << ", ref: " << tcu::toHex(refVal)
5744                     << tcu::TestLog::EndMessage;
5745             }
5746         }
5747     }
5748 
5749     if (PrintMode::None != printMode)
5750     {
5751         log << tcu::TestLog::Message << "deviceSubgroupSize: " << m_subgroupSize
5752             << ", testSubgroupSize: " << subgroupSize << ", invocationStride: " << invocationStride
5753             << ", shaderMaxLocs: " << shaderMaxLocs << "\n\t, framebuffer: " << m_data.sizeX << 'x' << m_data.sizeY
5754             << ", primitiveCount: " << primitiveCount << ", PRINT_MODE: "
5755             << ((PrintMode::ThreadsInColumns == printMode) ?
5756                     "\"ouLocs in rows & threads in columns\"" :
5757                     ((PrintMode::OutLocsInColumns == printMode) ? "\"threads in rows & outLocs in columns\"" : ""))
5758             << " { id:res,ref }\n"
5759             << tcu::TestLog::EndMessage;
5760     }
5761 
5762     uint32_t invMax = std::min(invocationStride, 80u);
5763 
5764     if (PrintMode::ThreadsInColumns == printMode)
5765     {
5766         for (uint32_t loc = 0; loc < shaderMaxLocs; ++loc)
5767         {
5768             auto l1 = log << tcu::TestLog::Message;
5769             l1 << "loc " << std::setw(3) << loc << ": ";
5770             for (uint32_t inv = 0; inv < invMax; ++inv)
5771             {
5772                 uint32_t idx = loc * invocationStride + inv;
5773                 DE_ASSERT(idx < maxLoc);
5774                 uint64_t resEntry = result[idx];
5775                 uint64_t refEntry = ref[idx];
5776                 //l1 << de::toString(inv) << ':' << tcu::toHex(resEntry) << ',' << tcu::toHex(refEntry) << ' ';
5777                 l1 << std::dec << inv << ':' << std::setw(subgroupSize / 4) << std::hex << resEntry << ','
5778                    << std::setw(subgroupSize / 4) << std::hex << refEntry << std::dec << ' ';
5779             }
5780             l1 << std::setw(0) << tcu::TestLog::EndMessage;
5781         }
5782     }
5783     else if (PrintMode::OutLocsInColumns == printMode)
5784     {
5785         for (uint32_t inv = 0; inv < invMax; ++inv)
5786         {
5787             auto l1 = log << tcu::TestLog::Message;
5788             l1 << "res " << std::setw(3) << inv << ": ";
5789             for (uint32_t loc = 0; loc < shaderMaxLocs; ++loc)
5790             {
5791                 uint32_t idx = loc * invocationStride + inv;
5792                 DE_ASSERT(idx < maxLoc);
5793                 uint64_t entry = result[idx];
5794                 l1 << de::toString(loc) << ':' << tcu::toHex(entry) << ' ';
5795             }
5796             l1 << std::setw(0) << tcu::TestLog::EndMessage;
5797 
5798             auto l2 = log << tcu::TestLog::Message;
5799             l2 << "ref " << std::setw(3) << inv << ": ";
5800             for (uint32_t loc = 0; loc < shaderMaxLocs; ++loc)
5801             {
5802                 uint32_t idx = loc * invocationStride + inv;
5803                 DE_ASSERT(idx < maxLoc);
5804                 uint64_t entry = ref[idx];
5805                 l2 << de::toString(loc) << ':' << tcu::toHex(entry) << ' ';
5806             }
5807             l2 << std::setw(0) << tcu::TestLog::EndMessage;
5808         }
5809     }
5810 
5811     if (mismatchCount)
5812     {
5813         double mismatchPercentage = 0.0;
5814         std::modf((double)(mismatchCount * 100) / (double)maxLoc, &mismatchPercentage);
5815         log << tcu::TestLog::Message << "Mismatch count " << mismatchCount << " from " << maxLoc << " ("
5816             << mismatchPercentage << "%)" << tcu::TestLog::EndMessage;
5817         res = QP_TEST_RESULT_FAIL;
5818     }
5819 
5820     if (res != QP_TEST_RESULT_PASS)
5821     {
5822         for (uint32_t i = 0; i < maxLoc; ++i)
5823         {
5824             // This log can be large and slow, ifdef it out by default
5825 #if 0
5826             log << tcu::TestLog::Message << "result " << i << "(" << (i / invocationStride) << ", " << (i % invocationStride) << "): " << tcu::toHex(result[i]) << " ref " << tcu::toHex(ref[i]) << (result[i] != ref[i] ? " different" : "") << tcu::TestLog::EndMessage;
5827 #endif
5828         }
5829     }
5830 
5831     return res;
5832 }
5833 
calculateAndLogResultEx(tcu::TestLog & log,const tcu::UVec4 * result,const std::vector<tcu::UVec4> & ref,const uint32_t maxLoc,const Arrangement & a,const PrintMode printMode)5834 qpTestResult_e ReconvergenceTestFragmentInstance::calculateAndLogResultEx(tcu::TestLog &log, const tcu::UVec4 *result,
5835                                                                           const std::vector<tcu::UVec4> &ref,
5836                                                                           const uint32_t maxLoc, const Arrangement &a,
5837                                                                           const PrintMode printMode)
5838 {
5839     DE_UNREF(printMode);
5840 
5841     qpTestResult res                             = QP_TEST_RESULT_PASS;
5842     uint32_t mismatchCount                       = 0u;
5843     const uint32_t printMismatchCount            = 5u;
5844     const FragmentRandomProgram::Arrangement &aa = static_cast<const FragmentRandomProgram::Arrangement &>(a);
5845 
5846     // With maximal reconvergence, we should expect the output to exactly match
5847     // the reference.
5848     const uint32_t ballotStoreCount = maxLoc * aa.m_invocationStride * aa.m_primitiveStride;
5849     for (uint32_t i = 0; i < ballotStoreCount; ++i)
5850     {
5851         const Ballot resultVal(result[i], aa.m_subgroupSize);
5852         ;
5853         const Ballot refVal(ref[i], aa.m_subgroupSize);
5854         if (resultVal != refVal)
5855         {
5856             if (mismatchCount++ < printMismatchCount)
5857             {
5858                 res = QP_TEST_RESULT_FAIL;
5859                 log << tcu::TestLog::Message << "Mismatch at " << i << "\nexpected: " << resultVal
5860                     << "\n     got: " << refVal << tcu::TestLog::EndMessage;
5861                 if (printMode == PrintMode::Console)
5862                 {
5863                     std::cout << "Mismatch at " << i << "\nexpected: " << resultVal << "\n     got: " << refVal
5864                               << std::endl;
5865                 }
5866             }
5867         }
5868     }
5869 
5870     log << tcu::TestLog::Message << "Mismatch count: " << mismatchCount << " from " << ballotStoreCount
5871         << tcu::TestLog::EndMessage;
5872     if (printMode == PrintMode::Console)
5873     {
5874         std::cout << "Mismatch count: " << mismatchCount << " from " << ballotStoreCount << std::endl;
5875     }
5876 
5877     return res;
5878 }
5879 
makeImageCreateInfo(VkFormat format) const5880 VkImageCreateInfo ReconvergenceTestFragmentInstance::makeImageCreateInfo(VkFormat format) const
5881 {
5882     return {
5883         VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, // VkStructureType sType;
5884         nullptr,                             // const void* pNext;
5885         VkImageCreateFlags(0),               // VkImageCreateFlags flags;
5886         VK_IMAGE_TYPE_2D,                    // VkImageType imageType;
5887         format,                              // VkFormat format;
5888         {m_data.sizeX, m_data.sizeY, 1u},    // VkExtent3D extent;
5889         1u,                                  // uint32_t mipLevels;
5890         1u,                                  // uint32_t arrayLayers;
5891         VK_SAMPLE_COUNT_1_BIT,               // VkSampleCountFlagBits samples;
5892         VK_IMAGE_TILING_OPTIMAL,             // VkImageTiling tiling;
5893         VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT, // VkImageUsageFlags usage;
5894         VK_SHARING_MODE_EXCLUSIVE,           // VkSharingMode sharingMode;
5895         0u,                                  // uint32_t queueFamilyIndexCount;
5896         0u,                                  // const uint32_t* pQueueFamilyIndices;
5897         VK_IMAGE_LAYOUT_UNDEFINED            // VkImageLayout initialLayout;
5898     };
5899 }
5900 
createVertexBufferAndFlush(uint32_t cellsHorz,uint32_t cellsVert,VkPrimitiveTopology topology)5901 de::MovePtr<BufferWithMemory> ReconvergenceTestFragmentInstance::createVertexBufferAndFlush(
5902     uint32_t cellsHorz, uint32_t cellsVert, VkPrimitiveTopology topology)
5903 {
5904     // DE_ASSERT(cellsHorz == 2u);
5905     // DE_ASSERT((cellsHorz * 3) == cellsVert);
5906     DE_UNREF(cellsHorz);
5907     DE_UNREF(cellsVert);
5908     DE_UNREF(topology);
5909     DE_ASSERT(topology == VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST);
5910     const std::vector<tcu::Vec4> vertices{{-1.0f, 0.0f, 0.0f, 0.0f},  {-0.5f, -1.0f, 0.0f, 0.0f},
5911                                           {+1.0f, +1.0f, 0.0f, 0.0f}, {+0.5f, -1.0f, 0.0f, 0.0f},
5912                                           {+1.0f, 0.0f, 0.0f, 0.0f},  {-1.0f, +1.0f, 0.0f, 0.0f}};
5913     return ReconvergenceTestGraphicsInstance::createVertexBufferAndFlush(vertices);
5914 }
5915 
callAuxiliaryShader(tcu::TestStatus & status,uint32_t triangleCount)5916 std::vector<uint32_t> ReconvergenceTestFragmentInstance::callAuxiliaryShader(tcu::TestStatus &status,
5917                                                                              uint32_t triangleCount)
5918 {
5919     const DeviceInterface &vk    = m_context.getDeviceInterface();
5920     const VkDevice device        = m_context.getDevice();
5921     add_ref<Allocator> allocator = m_context.getDefaultAllocator();
5922     const uint32_t queueIndex    = m_context.getUniversalQueueFamilyIndex();
5923     //add_ref<tcu::TestLog> log = m_context.getTestContext().getLog();
5924     const uint32_t bufferElems    = m_data.sizeX * m_data.sizeY * triangleCount + 3u;
5925     const VkDeviceSize bufferSize = bufferElems * sizeof(uint32_t);
5926 
5927     if (bufferSize > m_context.getDeviceProperties().limits.maxStorageBufferRange)
5928         TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
5929 
5930     const VkBufferCreateInfo createInfo =
5931         vk::makeBufferCreateInfo(bufferSize, (VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
5932                                               VK_BUFFER_USAGE_TRANSFER_SRC_BIT));
5933     de::MovePtr<BufferWithMemory> buffer;
5934     try
5935     {
5936         buffer = de::MovePtr<BufferWithMemory>(new BufferWithMemory(
5937             vk, device, allocator, createInfo, (MemoryRequirement::HostVisible | MemoryRequirement::Coherent)));
5938     }
5939     catch (tcu::ResourceError &)
5940     {
5941         // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
5942         status = tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
5943                                  "Failed device memory allocation " + de::toString(bufferSize) + " bytes");
5944         return {};
5945     }
5946 
5947     const VkDescriptorBufferInfo bufferInfo = makeDescriptorBufferInfo(**buffer, 0, bufferSize);
5948 
5949     vk::DescriptorSetLayoutBuilder layoutBuilder;
5950     layoutBuilder.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_FRAGMENT_BIT);
5951     vk::Unique<vk::VkDescriptorSetLayout> descriptorSetLayout(layoutBuilder.build(vk, device));
5952 
5953     vk::DescriptorPoolBuilder poolBuilder;
5954     poolBuilder.addType(vk::VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1u);
5955     vk::Unique<vk::VkDescriptorPool> descriptorPool(
5956         poolBuilder.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
5957 
5958     vk::Unique<vk::VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
5959 
5960     vk::DescriptorSetUpdateBuilder setUpdateBuilder;
5961     setUpdateBuilder.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(0),
5962                                  VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferInfo);
5963     setUpdateBuilder.update(vk, device);
5964 
5965     const VkPushConstantRange pushConstantRange{
5966         VK_SHADER_STAGE_FRAGMENT_BIT, // VkShaderStageFlags stageFlags;
5967         0u,                           // uint32_t offset;
5968         sizeof(PushConstant)          // uint32_t size;
5969     };
5970     const VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo{
5971         VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, // sType
5972         nullptr,                                       // pNext
5973         (VkPipelineLayoutCreateFlags)0,                // flags
5974         1u,                                            // setLayoutCount
5975         &descriptorSetLayout.get(),                    // pSetLayouts
5976         1u,                                            // pushConstantRangeCount
5977         &pushConstantRange,                            // pPushConstantRanges
5978     };
5979 
5980     const VkFormat format                   = VK_FORMAT_R8G8B8A8_UNORM;
5981     const VkImageCreateInfo imageCreateInfo = makeImageCreateInfo(format);
5982     const VkImageSubresourceRange rscRange  = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
5983     de::MovePtr<ImageWithMemory> image(
5984         new ImageWithMemory(vk, device, allocator, imageCreateInfo, vk::MemoryRequirement::Any));
5985     Move<VkImageView> view        = makeImageView(vk, device, **image, VK_IMAGE_VIEW_TYPE_2D, format, rscRange);
5986     Move<VkRenderPass> renderPass = makeRenderPass(vk, device, format);
5987     Move<VkFramebuffer> framebuffer =
5988         makeFramebuffer(vk, device, *renderPass, *view, m_data.sizeX, m_data.sizeY, rscRange.layerCount);
5989     const VkRenderPassBeginInfo renderBeginInfo = makeRenderPassBeginInfo(*renderPass, *framebuffer);
5990     auto createAuxShaders                       = [&]()
5991     {
5992         Shaders shaders;
5993         auto vert = createShaderModule(vk, device, m_context.getBinaryCollection().get("vert"), 0);
5994         auto frag = createShaderModule(vk, device, m_context.getBinaryCollection().get("aux"), 0);
5995         shaders.emplace_back(vert);
5996         shaders.emplace_back(frag);
5997         return shaders;
5998     };
5999     const Shaders shaders      = createAuxShaders();
6000     const uint32_t vertexCount = triangleCount * 3u;
6001     de::MovePtr<BufferWithMemory> vertexBuffer =
6002         createVertexBufferAndFlush(triangleCount, vertexCount, VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST);
6003     Move<VkPipelineLayout> pipelineLayout = createPipelineLayout(vk, device, &pipelineLayoutCreateInfo, NULL);
6004     Move<VkPipeline> pipeline = createGraphicsPipeline(*pipelineLayout, *renderPass, m_data.sizeX, m_data.sizeY,
6005                                                        shaders, VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST, 0U);
6006     Move<VkCommandPool> cmdPool =
6007         createCommandPool(vk, device, vk::VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, queueIndex);
6008     Move<VkCommandBuffer> cmdBuffer = allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY);
6009 
6010     PushConstant pc{};
6011     pc.invocationStride = 0u;
6012     pc.width            = m_data.sizeX;
6013     pc.height           = m_data.sizeY;
6014     pc.primitiveStride  = triangleCount;
6015 
6016     void *ptr        = buffer->getAllocation().getHostPtr();
6017     auto bufferRange = makeStdBeginEnd<uint32_t>(ptr, bufferElems);
6018     std::fill(bufferRange.first, bufferRange.second, 0u);
6019 
6020     std::bind(&ReconvergenceTestGraphicsInstance::recordDrawingAndSubmit, this, *cmdBuffer, *pipelineLayout, *pipeline,
6021               *descriptorSet, std::cref(pc), std::cref(renderBeginInfo), **vertexBuffer, vertexCount, **image)();
6022 
6023     status = tcu::TestStatus::pass(std::string());
6024     return std::vector<uint32_t>(bufferRange.first, bufferRange.second);
6025 }
6026 
iterate(void)6027 tcu::TestStatus ReconvergenceTestFragmentInstance::iterate(void)
6028 {
6029     const DeviceInterface &vk            = m_context.getDeviceInterface();
6030     const VkDevice device                = m_context.getDevice();
6031     add_ref<Allocator> allocator         = m_context.getDefaultAllocator();
6032     const uint32_t queueIndex            = m_context.getUniversalQueueFamilyIndex();
6033     add_ref<tcu::TestLog> log            = m_context.getTestContext().getLog();
6034     const VkPhysicalDeviceLimits &limits = m_context.getDeviceProperties().limits;
6035     const uint32_t fragmentStride        = m_data.sizeX * m_data.sizeY;
6036     const uint32_t primitiveStride       = 2;
6037 
6038     if (sizeof(PushConstant) > limits.maxPushConstantsSize)
6039     {
6040         return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
6041                                "PushConstant size " + std::to_string(sizeof(PushConstant)) + " exceeds device limit " +
6042                                    std::to_string(limits.maxPushConstantsSize));
6043     }
6044 
6045     tcu::TestStatus auxStatus(QP_TEST_RESULT_FAIL, std::string());
6046     std::vector<uint32_t> primitiveMap = callAuxiliaryShader(auxStatus, primitiveStride);
6047     if (auxStatus.isFail())
6048         return auxStatus;
6049 
6050     const uint32_t shaderSubgroupSize = primitiveMap.at(fragmentStride * primitiveStride + 1u);
6051     if (shaderSubgroupSize != m_subgroupSize)
6052     {
6053         return tcu::TestStatus(QP_TEST_RESULT_FAIL,
6054                                "The size of the subgroup from the shader (" + std::to_string(shaderSubgroupSize) +
6055                                    ") is different from the size of the subgroup from the device (" +
6056                                    std::to_string(m_subgroupSize) + ")");
6057     }
6058     const uint32_t shaderSubgroupStride = primitiveMap.at(fragmentStride * primitiveStride + 0u);
6059     const uint32_t hostSubgroupStride =
6060         FragmentRandomProgram::Arrangement::calcSubgroupCount(primitiveMap, primitiveStride, fragmentStride);
6061     if (shaderSubgroupStride != hostSubgroupStride)
6062     {
6063         return tcu::TestStatus(QP_TEST_RESULT_FAIL,
6064                                "The number of subgroups from the shader (" + std::to_string(shaderSubgroupStride) +
6065                                    ") is different from the number of subgroups calculated manually (" +
6066                                    std::to_string(hostSubgroupStride) + ")");
6067     }
6068 
6069     log << tcu::TestLog::Message << "Subgroup count: " << hostSubgroupStride << tcu::TestLog::EndMessage;
6070     log << tcu::TestLog::Message << "Subgroup size: " << m_subgroupSize << tcu::TestLog::EndMessage;
6071 
6072     const VkPrimitiveTopology topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST;
6073     de::MovePtr<BufferWithMemory> vertexBuffer =
6074         createVertexBufferAndFlush(primitiveStride, (primitiveStride * 3u), topology);
6075 
6076     std::vector<tcu::UVec4> ref;
6077     de::MovePtr<FragmentRandomProgram> program = FragmentRandomProgram::create(m_data);
6078     program->generateRandomProgram(m_context.getTestContext().getWatchDog(), log);
6079 
6080     const uint32_t simulationMaxLoc = program->execute(m_context.getTestContext().getWatchDog(), true, m_subgroupSize,
6081                                                        fragmentStride, primitiveStride, ref, log, primitiveMap);
6082     log << tcu::TestLog::Message << "simulated maxLoc: " << simulationMaxLoc << tcu::TestLog::EndMessage;
6083     // maxLoc is per-invocation. Add one (to make sure no additional writes are done)
6084     uint32_t maxLoc = simulationMaxLoc;
6085     maxLoc += 1;
6086     maxLoc *= (hostSubgroupStride * 128u * primitiveStride);
6087 
6088     constexpr uint32_t bufferCount = 4;
6089     enum Bindings
6090     {
6091         InputA,
6092         OutputBallots,
6093         OutputCounts,
6094         OutputPriMap
6095     };
6096 
6097     de::MovePtr<BufferWithMemory> buffers[bufferCount];
6098     vk::VkDescriptorBufferInfo bufferDescriptors[bufferCount];
6099 
6100     VkDeviceSize sizes[bufferCount]{
6101         // InputA  { uint    a[]; } inputA;  filled with a[i] := i
6102         (FragmentRandomProgram::conditionIfInvocationStride + 2) * sizeof(uint32_t),
6103 
6104         // OutputB { uvec4   b[]; } outputB;
6105         maxLoc * sizeof(tcu::UVec4),
6106 
6107         // OutputC { uint loc[]; } outputC;
6108         (hostSubgroupStride * 128u * primitiveStride) * sizeof(uint32_t),
6109 
6110         // OutputP { uvec   p[]; } outputP; few more for calculating subgroupID, subgroupSize, non-helper and helperinvocations
6111         (fragmentStride * primitiveStride + 16u) * sizeof(uint32_t)};
6112 
6113     VkBufferUsageFlags usages[bufferCount]{
6114         VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
6115         VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
6116         VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
6117         VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
6118     };
6119 
6120     // allocate buffers
6121     for (uint32_t i = 0; i < bufferCount; ++i)
6122     {
6123         if (sizes[i] > limits.maxStorageBufferRange)
6124             TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
6125 
6126         try
6127         {
6128             buffers[i] = de::MovePtr<BufferWithMemory>(
6129                 new BufferWithMemory(vk, device, allocator,
6130                                      makeBufferCreateInfo(sizes[i], usages[i] | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
6131                                                                         VK_BUFFER_USAGE_TRANSFER_SRC_BIT),
6132                                      MemoryRequirement::HostVisible | MemoryRequirement::Cached));
6133         }
6134         catch (tcu::ResourceError &)
6135         {
6136             // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
6137             return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
6138                                    "Failed device memory allocation " + de::toString(sizes[i]) + " bytes");
6139         }
6140         bufferDescriptors[i] = makeDescriptorBufferInfo(**buffers[i], 0, sizes[i]);
6141     }
6142 
6143     // get raw pointers to previously allocated buffers
6144     void *ptrs[bufferCount];
6145     for (uint32_t i = 0; i < bufferCount; ++i)
6146     {
6147         ptrs[i] = buffers[i]->getAllocation().getHostPtr();
6148     }
6149 
6150     // populate buffers with their destination
6151     {
6152         auto rangeBufferA =
6153             makeStdBeginEnd<uint32_t>(ptrs[InputA], static_cast<uint32_t>(sizes[InputA] / sizeof(uint32_t)));
6154         std::iota(rangeBufferA.first, rangeBufferA.second, 0u);
6155     }
6156     deMemset(ptrs[OutputBallots], 0, (size_t)sizes[OutputBallots]);
6157     deMemset(ptrs[OutputCounts], 0, (size_t)sizes[OutputCounts]);
6158     deMemset(ptrs[OutputPriMap], 0, (size_t)sizes[OutputPriMap]);
6159 
6160     // (...) and flush them to the GPU
6161     for (uint32_t i = 0; i < bufferCount; ++i)
6162     {
6163         flushAlloc(vk, device, buffers[i]->getAllocation());
6164     }
6165 
6166     VkDescriptorType descTypes[bufferCount]{
6167         VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
6168         VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
6169         VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
6170         VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
6171     };
6172 
6173     vk::DescriptorSetLayoutBuilder layoutBuilder;
6174     for (uint32_t i = 0; i < bufferCount; ++i)
6175     {
6176         layoutBuilder.addSingleBinding(descTypes[i], m_data.shaderStage);
6177     }
6178     vk::Unique<vk::VkDescriptorSetLayout> descriptorSetLayout(layoutBuilder.build(vk, device));
6179 
6180     vk::DescriptorPoolBuilder poolBuilder;
6181     for (uint32_t i = 0; i < bufferCount; ++i)
6182     {
6183         poolBuilder.addType(descTypes[i], 1);
6184     }
6185     vk::Unique<vk::VkDescriptorPool> descriptorPool(
6186         poolBuilder.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
6187     vk::Unique<vk::VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
6188 
6189     vk::DescriptorSetUpdateBuilder setUpdateBuilder;
6190     for (uint32_t i = 0; i < bufferCount; ++i)
6191     {
6192         setUpdateBuilder.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(i), descTypes[i],
6193                                      &bufferDescriptors[i]);
6194     }
6195     setUpdateBuilder.update(vk, device);
6196 
6197     const VkPushConstantRange pushConstantRange{
6198         (VkShaderStageFlags)m_data.shaderStage, // VkShaderStageFlags stageFlags;
6199         0u,                                     // uint32_t offset;
6200         sizeof(PushConstant)                    // uint32_t size;
6201     };
6202 
6203     const VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo{
6204         VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, // sType
6205         nullptr,                                       // pNext
6206         (VkPipelineLayoutCreateFlags)0,                // flags
6207         1u,                                            // setLayoutCount
6208         &descriptorSetLayout.get(),                    // pSetLayouts
6209         1u,                                            // pushConstantRangeCount
6210         &pushConstantRange,                            // pPushConstantRanges
6211     };
6212 
6213     const VkFormat format                   = VK_FORMAT_R8G8B8A8_UNORM;
6214     const VkImageCreateInfo imageCreateInfo = makeImageCreateInfo(format);
6215     const VkImageSubresourceRange rscRange  = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
6216     de::MovePtr<ImageWithMemory> image(
6217         new ImageWithMemory(vk, device, allocator, imageCreateInfo, vk::MemoryRequirement::Any));
6218     Move<VkImageView> view        = makeImageView(vk, device, **image, VK_IMAGE_VIEW_TYPE_2D, format, rscRange);
6219     Move<VkRenderPass> renderPass = makeRenderPass(vk, device, format);
6220     Move<VkFramebuffer> framebuffer =
6221         makeFramebuffer(vk, device, *renderPass, *view, m_data.sizeX, m_data.sizeY, rscRange.layerCount);
6222     const VkRenderPassBeginInfo renderBeginInfo = makeRenderPassBeginInfo(*renderPass, *framebuffer);
6223     const Shaders shaders                       = createShaders();
6224     Move<VkPipelineLayout> pipelineLayout       = createPipelineLayout(vk, device, &pipelineLayoutCreateInfo, NULL);
6225     Move<VkPipeline> pipeline =
6226         createGraphicsPipeline(*pipelineLayout, *renderPass, m_data.sizeX, m_data.sizeY, shaders, topology, 0U);
6227     Move<VkCommandPool> cmdPool =
6228         createCommandPool(vk, device, vk::VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, queueIndex);
6229     Move<VkCommandBuffer> cmdBuffer = allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY);
6230 
6231     PushConstant pc{};
6232     pc.width                 = m_data.sizeX;
6233     pc.height                = m_data.sizeY;
6234     pc.primitiveStride       = primitiveStride;
6235     pc.invocationStride      = 0u;
6236     pc.subgroupStride        = hostSubgroupStride;
6237     pc.enableInvocationIndex = VK_FALSE;
6238 
6239     auto callRecordDrawingAndSubmit = std::bind(
6240         &ReconvergenceTestGraphicsInstance::recordDrawingAndSubmit, this, *cmdBuffer, *pipelineLayout, *pipeline,
6241         *descriptorSet, std::cref(pc), std::cref(renderBeginInfo), **vertexBuffer, (primitiveStride * 3u), **image);
6242 
6243     // compute "maxLoc", which is a potential maximum number of locations written
6244     callRecordDrawingAndSubmit();
6245 
6246     // Take the maximum of "maxLoc" over all invocations.
6247     invalidateAlloc(vk, device, buffers[OutputCounts]->getAllocation());
6248     auto rangeLoc = makeStdBeginEnd<const uint32_t>(ptrs[OutputCounts], (hostSubgroupStride * 128u * primitiveStride));
6249     const uint32_t computedShaderMaxLoc = *max_element(rangeLoc.first, rangeLoc.second);
6250     log << tcu::TestLog::Message << "Computed maxLoc in the shader: " << computedShaderMaxLoc
6251         << tcu::TestLog::EndMessage;
6252 
6253     if (computedShaderMaxLoc >= FragmentRandomProgram::experimentalOutLocSize)
6254     {
6255         return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
6256                                "Calculated maxLoc from a shader (which is " + de::toString(computedShaderMaxLoc) +
6257                                    ") "
6258                                    "exceeds BALLOT_STACK_SIZE (which is " +
6259                                    de::toString(FragmentRandomProgram::experimentalOutLocSize) +
6260                                    ").\n"
6261                                    "To repair this just increment slightly a " MAKETEXT(
6262                                        FragmentRandomProgram::experimentalOutLocSize) " "
6263                                                                                       "in line " +
6264                                    de::toString(BALLOT_STACK_SIZE_DEFVAL_LINE));
6265     }
6266 
6267     // If we need more space, reallocate OutputB::b[]
6268     if (computedShaderMaxLoc != simulationMaxLoc)
6269     {
6270         // Add one (to make sure no additional writes are done) and multiply by
6271         // the number of invocations and current primitive count
6272         maxLoc = (std::max(computedShaderMaxLoc, simulationMaxLoc) + 1) * (hostSubgroupStride * 128u * primitiveStride);
6273         sizes[OutputBallots] = maxLoc * sizeof(tcu::UVec4);
6274 
6275         if (sizes[OutputBallots] > limits.maxStorageBufferRange)
6276             TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
6277 
6278         try
6279         {
6280             buffers[OutputBallots] = de::MovePtr<BufferWithMemory>(new BufferWithMemory(
6281                 vk, device, allocator,
6282                 makeBufferCreateInfo(sizes[OutputBallots], usages[OutputBallots] | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
6283                                                                VK_BUFFER_USAGE_TRANSFER_SRC_BIT),
6284                 MemoryRequirement::HostVisible | MemoryRequirement::Cached));
6285         }
6286         catch (tcu::ResourceError &)
6287         {
6288             // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
6289             return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
6290                                    "Failed device memory allocation " + de::toString(sizes[OutputBallots]) + " bytes");
6291         }
6292         bufferDescriptors[OutputBallots] = makeDescriptorBufferInfo(**buffers[OutputBallots], 0, sizes[OutputBallots]);
6293         ptrs[OutputBallots]              = buffers[OutputBallots]->getAllocation().getHostPtr();
6294 
6295         vk::DescriptorSetUpdateBuilder setUpdateBuilder2;
6296         setUpdateBuilder2.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(OutputBallots),
6297                                       VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptors[OutputBallots]);
6298         setUpdateBuilder2.update(vk, device);
6299     }
6300 
6301     // Clear any writes to ballots/stores OutputB::b[] aka buffer[OutputBallots] during the counting pass
6302     // Note that its size would may change since the first memory allocation
6303     deMemset(ptrs[OutputBallots], 0, (size_t)sizes[OutputBallots]);
6304     // Clear any writes to counting OutputC::loc[] aka buffer[OutputCounts] during the counting pass
6305     deMemset(ptrs[OutputCounts], 0, (size_t)sizes[OutputCounts]);
6306     // Clear any writes to counting OutputP::p[] aka buffer[OutputPriMap] during the counting pass
6307     deMemset(ptrs[OutputPriMap], 0, (size_t)sizes[OutputPriMap]);
6308 
6309     // flush them all to the GPU
6310     flushAlloc(vk, device, buffers[OutputBallots]->getAllocation());
6311     flushAlloc(vk, device, buffers[OutputCounts]->getAllocation());
6312     flushAlloc(vk, device, buffers[OutputPriMap]->getAllocation());
6313 
6314     // run the actual shader with updated PushConstant
6315     pc.enableInvocationIndex = VK_TRUE;
6316     callRecordDrawingAndSubmit();
6317 
6318     invalidateAlloc(vk, device, buffers[OutputCounts]->getAllocation());
6319     invalidateAlloc(vk, device, buffers[OutputBallots]->getAllocation());
6320     invalidateAlloc(vk, device, buffers[OutputPriMap]->getAllocation());
6321 
6322     // Simulate execution on the CPU, and compare against the GPU result
6323     try
6324     {
6325         ref.resize(maxLoc, tcu::UVec4());
6326     }
6327     catch (const std::bad_alloc &)
6328     {
6329         // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
6330         return tcu::TestStatus(QP_TEST_RESULT_NOT_SUPPORTED,
6331                                "Failed system memory allocation " + de::toString(maxLoc * sizeof(uint64_t)) + " bytes");
6332     }
6333 
6334     std::fill(primitiveMap.begin(), primitiveMap.end(), 0);
6335     auto primitiveMapRange = makeStdBeginEnd<const uint32_t>(ptrs[OutputPriMap], (fragmentStride * primitiveStride));
6336     std::copy(primitiveMapRange.first, primitiveMapRange.second, primitiveMap.begin());
6337 
6338     const FragmentRandomProgram::Arrangement a(primitiveMap, m_data.sizeX, m_data.sizeY, m_subgroupSize,
6339                                                primitiveStride);
6340     const tcu::UVec4 *ballots = static_cast<tcu::UVec4 *>(ptrs[OutputBallots]);
6341 
6342     program->execute(m_context.getTestContext().getWatchDog(), false, m_subgroupSize, fragmentStride, primitiveStride,
6343                      ref, log, primitiveMap, ballots);
6344 
6345     const uint32_t finalMaxLoc = std::max(computedShaderMaxLoc, simulationMaxLoc);
6346     const qpTestResult res     = calculateAndLogResultEx(log, ballots, ref, finalMaxLoc, a, PrintMode::None);
6347 
6348     return tcu::TestStatus(res, qpGetTestResultName(res));
6349 }
6350 
createVertexBufferAndFlush(uint32_t cellsHorz,uint32_t cellsVert,VkPrimitiveTopology topology)6351 de::MovePtr<BufferWithMemory> ReconvergenceTestVertexInstance::createVertexBufferAndFlush(uint32_t cellsHorz,
6352                                                                                           uint32_t cellsVert,
6353                                                                                           VkPrimitiveTopology topology)
6354 {
6355     DE_UNREF(topology);
6356     DE_ASSERT(VK_PRIMITIVE_TOPOLOGY_POINT_LIST == topology);
6357     const std::vector<tcu::Vec4> vertices =
6358         VertexRandomProgram::Arrangement::generatePrimitives(cellsHorz, cellsVert, VertexRandomProgram::fillPercentage);
6359     return ReconvergenceTestGraphicsInstance::createVertexBufferAndFlush(vertices);
6360 }
6361 
createShaders(void)6362 std::vector<Move<VkShaderModule>> ReconvergenceTestVertexInstance::createShaders(void)
6363 {
6364     const DeviceInterface &vk = m_context.getDeviceInterface();
6365     const VkDevice device     = m_context.getDevice();
6366 
6367     Move<VkShaderModule> vertex   = createShaderModule(vk, device, m_context.getBinaryCollection().get("test"), 0);
6368     Move<VkShaderModule> fragment = createShaderModule(vk, device, m_context.getBinaryCollection().get("frag"), 0);
6369 
6370     // { #vert, #frag, #tesc, tese, geom }; if any
6371     std::vector<Move<VkShaderModule>> shaders;
6372     shaders.emplace_back(vertex);
6373     shaders.emplace_back(fragment);
6374 
6375     return shaders;
6376 }
6377 
iterate(void)6378 tcu::TestStatus ReconvergenceTestVertexInstance::iterate(void)
6379 {
6380     const VkPhysicalDeviceLimits &limits = m_context.getDeviceProperties().limits;
6381     if (sizeof(PushConstant) > limits.maxPushConstantsSize)
6382     {
6383         return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
6384                                "PushConstant size " + std::to_string(sizeof(PushConstant)) + " exceeds device limit " +
6385                                    std::to_string(limits.maxPushConstantsSize));
6386     }
6387 
6388     const DeviceInterface &vk          = m_context.getDeviceInterface();
6389     const VkDevice device              = m_context.getDevice();
6390     Allocator &allocator               = m_context.getDefaultAllocator();
6391     const uint32_t queueIndex          = m_context.getUniversalQueueFamilyIndex();
6392     add_ref<tcu::TestLog> log          = m_context.getTestContext().getLog();
6393     const VkPrimitiveTopology topology = VK_PRIMITIVE_TOPOLOGY_POINT_LIST;
6394     const uint32_t fragmentStride      = uint32_t(m_data.sizeX * m_data.sizeY);
6395     const uint32_t invocationStride =
6396         static_cast<uint32_t>(VertexRandomProgram::Arrangement::generatePrimitives(m_data.sizeX, m_data.sizeY,
6397                                                                                    VertexRandomProgram::fillPercentage)
6398                                   .size());
6399 
6400     de::MovePtr<VertexRandomProgram> program(new VertexRandomProgram(m_data));
6401     program->generateRandomProgram(m_context.getTestContext().getWatchDog(), log);
6402 
6403     // simulate content of outputP buffer
6404     std::vector<uint32_t> outputP =
6405         VertexRandomProgram::Arrangement::generateOutputPvector(m_subgroupSize, invocationStride);
6406 
6407     std::vector<tcu::UVec4> ref;
6408     const uint32_t hostMaxLoc = program->execute(m_context.getTestContext().getWatchDog(), true, m_subgroupSize,
6409                                                  fragmentStride, invocationStride, ref, log, outputP, nullptr);
6410     log << tcu::TestLog::Message << "Rendering area  : " << tcu::UVec2(m_data.sizeX, m_data.sizeY)
6411         << tcu::TestLog::EndMessage;
6412     log << tcu::TestLog::Message << "invocationStride: " << invocationStride << tcu::TestLog::EndMessage;
6413     log << tcu::TestLog::Message << "Simulated maxLoc: " << hostMaxLoc << tcu::TestLog::EndMessage;
6414     // maxLoc is per-invocation. Add one (to make sure no additional writes are done).
6415     uint32_t maxLoc = hostMaxLoc;
6416     maxLoc += 1;
6417     maxLoc *= invocationStride;
6418 
6419     constexpr uint32_t bufferCount = 4u;
6420     enum Bindings
6421     {
6422         InputA,
6423         OutputBallots,
6424         OutputCounts,
6425         OutputPrimitives
6426     };
6427 
6428     de::MovePtr<BufferWithMemory> buffers[bufferCount];
6429     vk::VkDescriptorBufferInfo bufferDescriptors[bufferCount];
6430 
6431     uint32_t counts[bufferCount]{// InputA  { uint    a[]; } inputA;
6432                                  uint32_t(m_data.sizeX * m_data.sizeY),
6433                                  // OutputB { uvec2   b[]; } outputB;
6434                                  maxLoc,
6435                                  // OutputC { uint loc[]; } outputC;
6436                                  invocationStride,
6437                                  // OutputP { uint p[]; } outputP;
6438                                  uint32_t(outputP.size())};
6439 
6440     VkDeviceSize sizes[bufferCount]{// InputA  { uint    a[]; } inputA;
6441                                     counts[InputA] * sizeof(uint32_t),
6442                                     // OutputB { uvec2   b[]; } outputB;
6443                                     counts[OutputBallots] * sizeof(tcu::UVec4),
6444                                     // OutputC { uint loc[]; } outputC;
6445                                     counts[OutputCounts] * sizeof(uint32_t),
6446                                     // OutputP { uint p[]; } outputP;
6447                                     counts[OutputPrimitives] * sizeof(uint32_t)};
6448 
6449     const VkBufferUsageFlags cmnUsages = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
6450     VkBufferUsageFlags usages[bufferCount]{
6451         VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
6452         VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
6453         VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
6454         VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
6455     };
6456 
6457     // allocate buffers
6458     for (uint32_t i = 0; i < bufferCount; ++i)
6459     {
6460         if (sizes[i] > limits.maxStorageBufferRange)
6461             TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
6462 
6463         try
6464         {
6465             buffers[i] = de::MovePtr<BufferWithMemory>(
6466                 new BufferWithMemory(vk, device, allocator, makeBufferCreateInfo(sizes[i], usages[i] | cmnUsages),
6467                                      MemoryRequirement::HostVisible | MemoryRequirement::Cached));
6468         }
6469         catch (tcu::ResourceError &)
6470         {
6471             // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
6472             return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
6473                                    "Failed device memory allocation " + de::toString(sizes[i]) + " bytes");
6474         }
6475         bufferDescriptors[i] = makeDescriptorBufferInfo(**buffers[i], 0, sizes[i]);
6476     }
6477 
6478     // get raw pointers to previously allocated buffers
6479     void *ptrs[bufferCount];
6480     for (uint32_t i = 0; i < bufferCount; ++i)
6481     {
6482         ptrs[i] = buffers[i]->getAllocation().getHostPtr();
6483     }
6484 
6485     // populate buffers with their destination
6486     {
6487         auto rangeBufferA = makeStdBeginEnd<uint32_t>(ptrs[InputA], counts[InputA]);
6488         std::iota(rangeBufferA.first, rangeBufferA.second, 0u);
6489     }
6490     deMemset(ptrs[OutputBallots], 0, (size_t)sizes[OutputBallots]);
6491     deMemset(ptrs[OutputCounts], 0, (size_t)sizes[OutputCounts]);
6492     deMemset(ptrs[OutputPrimitives], 0, (size_t)sizes[OutputPrimitives]);
6493 
6494     // (...) and flush them to the GPU
6495     for (uint32_t i = 0; i < bufferCount; ++i)
6496     {
6497         flushAlloc(vk, device, buffers[i]->getAllocation());
6498     }
6499 
6500     VkDescriptorType descTypes[bufferCount]{VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
6501                                             VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER};
6502 
6503     vk::DescriptorSetLayoutBuilder layoutBuilder;
6504     for (uint32_t i = 0; i < bufferCount; ++i)
6505     {
6506         layoutBuilder.addSingleBinding(descTypes[i], m_data.shaderStage);
6507     }
6508     vk::Unique<vk::VkDescriptorSetLayout> descriptorSetLayout(layoutBuilder.build(vk, device));
6509 
6510     vk::DescriptorPoolBuilder poolBuilder;
6511     for (uint32_t i = 0; i < bufferCount; ++i)
6512     {
6513         poolBuilder.addType(descTypes[i], 1);
6514     }
6515     vk::Unique<vk::VkDescriptorPool> descriptorPool(
6516         poolBuilder.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
6517     vk::Unique<vk::VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
6518 
6519     vk::DescriptorSetUpdateBuilder setUpdateBuilder;
6520     for (uint32_t i = 0; i < bufferCount; ++i)
6521     {
6522         setUpdateBuilder.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(i), descTypes[i],
6523                                      &bufferDescriptors[i]);
6524     }
6525     setUpdateBuilder.update(vk, device);
6526 
6527     const VkPushConstantRange pushConstantRange{
6528         (VkShaderStageFlags)m_data.shaderStage, // VkShaderStageFlags stageFlags;
6529         0u,                                     // uint32_t offset;
6530         sizeof(PushConstant)                    // uint32_t size;
6531     };
6532 
6533     const VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo{
6534         VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, // sType
6535         nullptr,                                       // pNext
6536         (VkPipelineLayoutCreateFlags)0,                // flags
6537         1u,                                            // setLayoutCount
6538         &descriptorSetLayout.get(),                    // pSetLayouts
6539         1u,                                            // pushConstantRangeCount
6540         &pushConstantRange,                            // pPushConstantRanges
6541     };
6542 
6543     const uint32_t imageWidth  = m_data.sizeX;
6544     const uint32_t imageHeight = m_data.sizeY;
6545     const VkFormat format      = VK_FORMAT_R8G8B8A8_UNORM;
6546     const VkImageCreateInfo imageCreateInfo{
6547         VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, // VkStructureType sType;
6548         nullptr,                             // const void* pNext;
6549         VkImageCreateFlags(0),               // VkImageCreateFlags flags;
6550         VK_IMAGE_TYPE_2D,                    // VkImageType imageType;
6551         format,                              // VkFormat format;
6552         {imageWidth, imageHeight, 1u},       // VkExtent3D extent;
6553         1u,                                  // uint32_t mipLevels;
6554         1u,                                  // uint32_t arrayLayers;
6555         VK_SAMPLE_COUNT_1_BIT,               // VkSampleCountFlagBits samples;
6556         VK_IMAGE_TILING_OPTIMAL,             // VkImageTiling tiling;
6557         VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT, // VkImageUsageFlags usage;
6558         VK_SHARING_MODE_EXCLUSIVE,           // VkSharingMode sharingMode;
6559         0u,                                  // uint32_t queueFamilyIndexCount;
6560         0u,                                  // const uint32_t* pQueueFamilyIndices;
6561         VK_IMAGE_LAYOUT_UNDEFINED            // VkImageLayout initialLayout;
6562     };
6563     const VkImageSubresourceRange rscRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
6564     de::MovePtr<ImageWithMemory> image(
6565         new ImageWithMemory(vk, device, allocator, imageCreateInfo, vk::MemoryRequirement::Any));
6566     Move<VkImageView> view        = makeImageView(vk, device, **image, VK_IMAGE_VIEW_TYPE_2D, format, rscRange);
6567     Move<VkRenderPass> renderPass = makeRenderPass(vk, device, format);
6568     Move<VkFramebuffer> framebuffer =
6569         makeFramebuffer(vk, device, *renderPass, *view, m_data.sizeX, m_data.sizeY, rscRange.layerCount);
6570     de::MovePtr<BufferWithMemory> vertexBuffer  = createVertexBufferAndFlush(m_data.sizeX, m_data.sizeY, topology);
6571     const VkRenderPassBeginInfo renderBeginInfo = makeRenderPassBeginInfo(*renderPass, *framebuffer);
6572     const Shaders shaders                       = createShaders();
6573     Move<VkPipelineLayout> pipelineLayout       = createPipelineLayout(vk, device, &pipelineLayoutCreateInfo, NULL);
6574     Move<VkPipeline> pipeline =
6575         createGraphicsPipeline(*pipelineLayout, *renderPass, imageWidth, imageHeight, shaders, topology, 0u);
6576     Move<VkCommandPool> cmdPool =
6577         createCommandPool(vk, device, vk::VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, queueIndex);
6578     Move<VkCommandBuffer> cmdBuffer = allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY);
6579 
6580     PushConstant pc{};
6581     pc.invocationStride      = invocationStride;
6582     pc.width                 = m_data.sizeX;
6583     pc.height                = m_data.sizeY;
6584     pc.enableInvocationIndex = VK_FALSE;
6585 
6586     auto callRecordDrawingAndSubmit = std::bind(&ReconvergenceTestGraphicsInstance::recordDrawingAndSubmit, this,
6587                                                 *cmdBuffer, *pipelineLayout, *pipeline, *descriptorSet, std::cref(pc),
6588                                                 std::cref(renderBeginInfo), **vertexBuffer, invocationStride, **image);
6589 
6590     // compute "maxLoc", which is a potential maximum number of locations written
6591     callRecordDrawingAndSubmit();
6592 
6593     // Take the maximum of "maxLoc" over all invocations.
6594     invalidateAlloc(vk, device, buffers[OutputCounts]->getAllocation());
6595     auto rangeLoc               = makeStdBeginEnd<const uint32_t>(ptrs[OutputCounts], counts[OutputCounts]);
6596     const uint32_t shaderMaxLoc = (*max_element(rangeLoc.first, rangeLoc.second));
6597     log << tcu::TestLog::Message << "Computed maxLoc in shader: " << shaderMaxLoc << tcu::TestLog::EndMessage;
6598 
6599     // If we need more space, reallocate OutputB::b[] aka buffers[1]
6600     if (shaderMaxLoc != hostMaxLoc)
6601     {
6602         // Add one (to make sure no additional writes are done) and multiply by
6603         // the number of invocations and current primitive count
6604         maxLoc                = (std::max(shaderMaxLoc, hostMaxLoc) + 1u) * invocationStride;
6605         counts[OutputBallots] = maxLoc;
6606         sizes[OutputBallots]  = counts[OutputBallots] * sizeof(tcu::UVec4);
6607 
6608         if (sizes[OutputBallots] > limits.maxStorageBufferRange)
6609             TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
6610 
6611         try
6612         {
6613             buffers[OutputBallots] = de::MovePtr<BufferWithMemory>(new BufferWithMemory(
6614                 vk, device, allocator, makeBufferCreateInfo(sizes[OutputBallots], usages[OutputBallots] | cmnUsages),
6615                 MemoryRequirement::HostVisible | MemoryRequirement::Cached));
6616         }
6617         catch (tcu::ResourceError &)
6618         {
6619             // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
6620             return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
6621                                    "Failed device memory allocation " + de::toString(sizes[OutputBallots]) + " bytes");
6622         }
6623         bufferDescriptors[OutputBallots] = makeDescriptorBufferInfo(**buffers[OutputBallots], 0, sizes[OutputBallots]);
6624         ptrs[OutputBallots]              = buffers[OutputBallots]->getAllocation().getHostPtr();
6625 
6626         vk::DescriptorSetUpdateBuilder setUpdateBuilder2;
6627         setUpdateBuilder2.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(OutputBallots),
6628                                       VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptors[OutputBallots]);
6629         setUpdateBuilder2.update(vk, device);
6630     }
6631 
6632     // Clear any writes to ballots/stores OutputB::b[] aka buffer[1] during the counting pass
6633     // Note that its size would may change since the first memory allocation
6634     deMemset(ptrs[OutputBallots], 0, (size_t)sizes[OutputBallots]);
6635     deMemset(ptrs[OutputCounts], 0, (size_t)sizes[OutputCounts]);
6636     deMemset(ptrs[OutputPrimitives], 0, (size_t)sizes[OutputPrimitives]);
6637 
6638     // flush them all to the GPU
6639     flushAlloc(vk, device, buffers[OutputBallots]->getAllocation());
6640     flushAlloc(vk, device, buffers[OutputCounts]->getAllocation());
6641     flushAlloc(vk, device, buffers[OutputPrimitives]->getAllocation());
6642 
6643     // run the actual shader with updated PushConstant
6644     pc.enableInvocationIndex = VK_TRUE;
6645     callRecordDrawingAndSubmit();
6646 
6647     invalidateAlloc(vk, device, buffers[OutputCounts]->getAllocation());
6648     const uint32_t finalShaderMaxLoc = (*max_element(rangeLoc.first, rangeLoc.second));
6649     log << tcu::TestLog::Message << "Final maxLoc from shader: " << finalShaderMaxLoc << tcu::TestLog::EndMessage;
6650     if (finalShaderMaxLoc != shaderMaxLoc)
6651     {
6652         return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
6653                                "maxLoc differs across shader invocations, expected: " + de::toString(shaderMaxLoc) +
6654                                    " got: " + de::toString(finalShaderMaxLoc));
6655     }
6656 
6657     invalidateAlloc(vk, device, buffers[OutputBallots]->getAllocation());
6658     const tcu::UVec4 *ballots = static_cast<tcu::UVec4 *>(ptrs[OutputBallots]);
6659 
6660     invalidateAlloc(vk, device, buffers[OutputPrimitives]->getAllocation());
6661     auto outputPrange = makeStdBeginEnd<uint32_t>(ptrs[OutputPrimitives], counts[OutputPrimitives]);
6662     std::copy(outputPrange.first, outputPrange.second, outputP.begin());
6663 
6664     try
6665     {
6666         ref.resize(counts[OutputBallots], tcu::UVec4(0u, 0u, 0u, 0u));
6667     }
6668     catch (const std::bad_alloc &)
6669     {
6670         // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
6671         return tcu::TestStatus(QP_TEST_RESULT_NOT_SUPPORTED,
6672                                "Failed system memory allocation " + de::toString(sizes[OutputBallots]) + " bytes");
6673     }
6674 
6675     // Simulate execution on the CPU, and compare against the GPU result
6676     const uint32_t finalHostMaxLoc = program->execute(m_context.getTestContext().getWatchDog(), false, m_subgroupSize,
6677                                                       fragmentStride, invocationStride, ref, log, outputP, ballots);
6678 
6679     const qpTestResult res = calculateAndLogResultEx(log, ballots, ref, finalHostMaxLoc, PrintMode::None);
6680 
6681     return tcu::TestStatus(res, qpGetTestResultName(res));
6682 }
6683 
calculateAndLogResultEx(add_ref<tcu::TestLog> log,const tcu::UVec4 * result,const std::vector<tcu::UVec4> & ref,const uint32_t maxLoc,const PrintMode printMode)6684 qpTestResult_e ReconvergenceTestVertexInstance::calculateAndLogResultEx(add_ref<tcu::TestLog> log,
6685                                                                         const tcu::UVec4 *result,
6686                                                                         const std::vector<tcu::UVec4> &ref,
6687                                                                         const uint32_t maxLoc,
6688                                                                         const PrintMode printMode)
6689 {
6690     DE_UNREF(maxLoc);
6691     DE_UNREF(printMode);
6692 
6693     qpTestResult res                  = QP_TEST_RESULT_PASS;
6694     uint32_t mismatchCount            = 0u;
6695     const uint32_t printMismatchCount = 5u;
6696 
6697     // With maximal reconvergence, we should expect the output to exactly match the reference.
6698     const uint32_t ballotStoreCount = static_cast<uint32_t>(ref.size());
6699     for (uint32_t i = 0; i < ballotStoreCount; ++i)
6700     {
6701         const Ballot resultVal(result[i], m_subgroupSize);
6702         const Ballot refVal(ref.at(i), m_subgroupSize);
6703         if (resultVal != refVal)
6704         {
6705             if (mismatchCount++ < printMismatchCount)
6706             {
6707                 res = QP_TEST_RESULT_FAIL;
6708                 log << tcu::TestLog::Message << "Mismatch at " << i << "\nexpected: " << resultVal
6709                     << "\n     got: " << refVal << tcu::TestLog::EndMessage;
6710                 if (printMode == PrintMode::Console)
6711                 {
6712                     std::cout << "Mismatch at " << i << "\nexpected: " << resultVal << "\n     got: " << refVal
6713                               << std::endl;
6714                 }
6715             }
6716         }
6717     }
6718 
6719     log << tcu::TestLog::Message << "Mismatch count: " << mismatchCount << " from " << ballotStoreCount
6720         << tcu::TestLog::EndMessage;
6721     if (printMode == PrintMode::Console)
6722     {
6723         std::cout << "Mismatch count: " << mismatchCount << " from " << ballotStoreCount << std::endl;
6724     }
6725 
6726     return res;
6727 }
6728 
createShaders(void)6729 std::vector<Move<VkShaderModule>> ReconvergenceTestTessCtrlInstance::createShaders(void)
6730 {
6731     const DeviceInterface &vk = m_context.getDeviceInterface();
6732     const VkDevice device     = m_context.getDevice();
6733 
6734     Move<VkShaderModule> vertex     = createShaderModule(vk, device, m_context.getBinaryCollection().get("vert"));
6735     Move<VkShaderModule> fragment   = createShaderModule(vk, device, m_context.getBinaryCollection().get("frag"));
6736     Move<VkShaderModule> control    = createShaderModule(vk, device, m_context.getBinaryCollection().get("test"));
6737     Move<VkShaderModule> evaluation = createShaderModule(vk, device, m_context.getBinaryCollection().get("tese"));
6738 
6739     // { #vert, #frag, #tesc, #tese, geom }; if any
6740     std::vector<Move<VkShaderModule>> shaders;
6741     shaders.emplace_back(vertex);
6742     shaders.emplace_back(fragment);
6743     shaders.emplace_back(control);
6744     shaders.emplace_back(evaluation);
6745 
6746     return shaders;
6747 }
6748 
iterate(void)6749 tcu::TestStatus ReconvergenceTestTessCtrlInstance::iterate(void)
6750 {
6751     const DeviceInterface &vk = m_context.getDeviceInterface();
6752     const VkDevice device     = m_context.getDevice();
6753     Allocator &allocator      = m_context.getDefaultAllocator();
6754     const uint32_t queueIndex = m_context.getUniversalQueueFamilyIndex();
6755     add_ref<tcu::TestLog> log = m_context.getTestContext().getLog();
6756 
6757     if (m_subgroupSize < TessCtrlRandomProgram::minSubgroupSize || m_subgroupSize > 64)
6758     {
6759         std::stringstream str;
6760         str << "Subgroup size less than " << TessCtrlRandomProgram::minSubgroupSize
6761             << " or greater than 64 not handled.";
6762         str.flush();
6763         TCU_THROW(TestError, str.str());
6764     }
6765 
6766     deRandom rnd;
6767     deRandom_init(&rnd, m_data.seed);
6768 
6769     vk::VkPhysicalDeviceProperties2 properties2;
6770     deMemset(&properties2, 0, sizeof(properties2));
6771     properties2.sType = vk::VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2;
6772     m_context.getInstanceInterface().getPhysicalDeviceProperties2(m_context.getPhysicalDevice(), &properties2);
6773     const VkPhysicalDeviceLimits &limits = properties2.properties.limits;
6774 
6775     const uint32_t patchControlPoints = 1;
6776     const uint32_t vertexCount =
6777         (m_subgroupSize / TessCtrlRandomProgram::minSubgroupSize) * patchControlPoints * m_data.sizeX;
6778     const uint32_t primitiveStride = vertexCount / patchControlPoints;
6779     de::MovePtr<BufferWithMemory> vertexBuffer =
6780         createVertexBufferAndFlush(vertexCount, 1u, VK_PRIMITIVE_TOPOLOGY_PATCH_LIST);
6781     const uint32_t invocationStride = vertexCount * TessCtrlRandomProgram::minSubgroupSize;
6782     DE_ASSERT(invocationStride < MAX_INVOCATIONS_ALL_TESTS);
6783 
6784     log << tcu::TestLog::Message << "LayoutVertexOut:    " << (uint32_t)TessCtrlRandomProgram::minSubgroupSize
6785         << tcu::TestLog::EndMessage;
6786     log << tcu::TestLog::Message << "patchControlPoints: " << patchControlPoints << tcu::TestLog::EndMessage;
6787     log << tcu::TestLog::Message << "primitiveStride:    " << primitiveStride << tcu::TestLog::EndMessage;
6788     log << tcu::TestLog::Message << "invocationStride:   " << invocationStride << tcu::TestLog::EndMessage;
6789     log << tcu::TestLog::Message << "usedSubgroupCount:  " << m_data.sizeX << tcu::TestLog::EndMessage;
6790 
6791     de::MovePtr<TessCtrlRandomProgram> program(new TessCtrlRandomProgram(m_data, invocationStride));
6792     program->generateRandomProgram(m_context.getTestContext().getWatchDog(), log);
6793 
6794     std::vector<uint64_t> ref;
6795     const uint32_t simulationMaxLoc = program->simulate(true, m_subgroupSize, ref);
6796     log << tcu::TestLog::Message << "simulated maxLoc: " << simulationMaxLoc << tcu::TestLog::EndMessage;
6797     // maxLoc is per-invocation. Add one (to make sure no additional writes are done)
6798     uint32_t maxLoc = simulationMaxLoc;
6799     maxLoc += 1;
6800     maxLoc *= invocationStride;
6801 
6802     constexpr uint32_t bufferCount = 3;
6803     enum Bindings
6804     {
6805         InputA,
6806         OutputBallots,
6807         OutputCounts,
6808     };
6809 
6810     de::MovePtr<BufferWithMemory> buffers[bufferCount];
6811     vk::VkDescriptorBufferInfo bufferDescriptors[bufferCount];
6812 
6813     VkDeviceSize sizes[bufferCount]{
6814         // InputA  { uint    a[]; } inputA;  filled with a[i] == i
6815         invocationStride * sizeof(uint32_t),
6816         // OutputB { uvec2   b[]; } outputB;
6817         maxLoc * sizeof(uint64_t),
6818         // OutputC { uint loc[]; } outputC;
6819         invocationStride * sizeof(uint32_t),
6820     };
6821 
6822     VkBufferUsageFlags usages[bufferCount]{
6823         VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
6824         VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
6825         VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
6826     };
6827 
6828     // allocate buffers
6829     for (uint32_t i = 0; i < bufferCount; ++i)
6830     {
6831         if (sizes[i] > limits.maxStorageBufferRange)
6832             TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
6833 
6834         try
6835         {
6836             buffers[i] = de::MovePtr<BufferWithMemory>(
6837                 new BufferWithMemory(vk, device, allocator,
6838                                      makeBufferCreateInfo(sizes[i], usages[i] | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
6839                                                                         VK_BUFFER_USAGE_TRANSFER_SRC_BIT),
6840                                      MemoryRequirement::HostVisible | MemoryRequirement::Cached));
6841         }
6842         catch (tcu::ResourceError &)
6843         {
6844             // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
6845             return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
6846                                    "Failed device memory allocation " + de::toString(sizes[i]) + " bytes");
6847         }
6848         bufferDescriptors[i] = makeDescriptorBufferInfo(**buffers[i], 0, sizes[i]);
6849     }
6850 
6851     // get raw pointers to previously allocated buffers
6852     void *ptrs[bufferCount];
6853     for (uint32_t i = 0; i < bufferCount; ++i)
6854     {
6855         ptrs[i] = (uint32_t *)buffers[i]->getAllocation().getHostPtr();
6856     }
6857 
6858     // populate buffers with their destination
6859     {
6860         auto rangeBufferA = makeStdBeginEnd<uint32_t>(ptrs[InputA], invocationStride);
6861         std::iota(rangeBufferA.first, rangeBufferA.second, 0u);
6862     }
6863     deMemset(ptrs[OutputBallots], 0, (size_t)sizes[OutputBallots]);
6864     deMemset(ptrs[OutputCounts], 0, (size_t)sizes[OutputCounts]);
6865 
6866     // (...) and flush them to the GPU
6867     for (uint32_t i = 0; i < bufferCount; ++i)
6868     {
6869         flushAlloc(vk, device, buffers[i]->getAllocation());
6870     }
6871 
6872     VkDescriptorType descTypes[bufferCount]{
6873         VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
6874         VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
6875         VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
6876     };
6877 
6878     vk::DescriptorSetLayoutBuilder layoutBuilder;
6879     for (uint32_t i = 0; i < bufferCount; ++i)
6880     {
6881         layoutBuilder.addSingleBinding(descTypes[i], m_data.shaderStage);
6882     }
6883     vk::Unique<vk::VkDescriptorSetLayout> descriptorSetLayout(layoutBuilder.build(vk, device));
6884 
6885     vk::DescriptorPoolBuilder poolBuilder;
6886     for (uint32_t i = 0; i < bufferCount; ++i)
6887     {
6888         poolBuilder.addType(descTypes[i], 1);
6889     }
6890     vk::Unique<vk::VkDescriptorPool> descriptorPool(
6891         poolBuilder.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
6892     vk::Unique<vk::VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
6893 
6894     vk::DescriptorSetUpdateBuilder setUpdateBuilder;
6895     for (uint32_t i = 0; i < bufferCount; ++i)
6896     {
6897         setUpdateBuilder.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(i), descTypes[i],
6898                                      &bufferDescriptors[i]);
6899     }
6900     setUpdateBuilder.update(vk, device);
6901 
6902     const VkPushConstantRange pushConstantRange{
6903         (VkShaderStageFlags)m_data.shaderStage, // VkShaderStageFlags stageFlags;
6904         0u,                                     // uint32_t offset;
6905         sizeof(PushConstant)                    // uint32_t size;
6906     };
6907 
6908     // TODO: verify that PushConstant is available on running machine
6909 
6910     const VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo{
6911         VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, // sType
6912         nullptr,                                       // pNext
6913         (VkPipelineLayoutCreateFlags)0,                // flags
6914         1u,                                            // setLayoutCount
6915         &descriptorSetLayout.get(),                    // pSetLayouts
6916         1u,                                            // pushConstantRangeCount
6917         &pushConstantRange,                            // pPushConstantRanges
6918     };
6919 
6920     const uint32_t imageWidth  = 256;
6921     const uint32_t imageHeight = 256;
6922     const VkFormat format      = VK_FORMAT_R8G8B8A8_UNORM;
6923     const VkImageCreateInfo imageCreateInfo{
6924         VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, // VkStructureType sType;
6925         nullptr,                             // const void* pNext;
6926         VkImageCreateFlags(0),               // VkImageCreateFlags flags;
6927         VK_IMAGE_TYPE_2D,                    // VkImageType imageType;
6928         format,                              // VkFormat format;
6929         {imageWidth, imageHeight, 1u},       // VkExtent3D extent;
6930         1u,                                  // uint32_t mipLevels;
6931         1u,                                  // uint32_t arrayLayers;
6932         VK_SAMPLE_COUNT_1_BIT,               // VkSampleCountFlagBits samples;
6933         VK_IMAGE_TILING_OPTIMAL,             // VkImageTiling tiling;
6934         VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT, // VkImageUsageFlags usage;
6935         VK_SHARING_MODE_EXCLUSIVE,           // VkSharingMode sharingMode;
6936         0u,                                  // uint32_t queueFamilyIndexCount;
6937         0u,                                  // const uint32_t* pQueueFamilyIndices;
6938         VK_IMAGE_LAYOUT_UNDEFINED            // VkImageLayout initialLayout;
6939     };
6940     const VkImageSubresourceRange rscRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
6941     de::MovePtr<ImageWithMemory> image(
6942         new ImageWithMemory(vk, device, allocator, imageCreateInfo, vk::MemoryRequirement::Any));
6943     Move<VkImageView> view        = makeImageView(vk, device, **image, VK_IMAGE_VIEW_TYPE_2D, format, rscRange);
6944     Move<VkRenderPass> renderPass = makeRenderPass(vk, device, format);
6945     Move<VkFramebuffer> framebuffer =
6946         makeFramebuffer(vk, device, *renderPass, *view, m_data.sizeX, m_data.sizeY, rscRange.layerCount);
6947     const VkRenderPassBeginInfo renderBeginInfo = makeRenderPassBeginInfo(*renderPass, *framebuffer);
6948     const Shaders shaders                       = createShaders();
6949     Move<VkPipelineLayout> pipelineLayout       = createPipelineLayout(vk, device, &pipelineLayoutCreateInfo, NULL);
6950     Move<VkPipeline> pipeline = createGraphicsPipeline(*pipelineLayout, *renderPass, imageWidth, imageHeight, shaders,
6951                                                        VK_PRIMITIVE_TOPOLOGY_PATCH_LIST, patchControlPoints);
6952     Move<VkCommandPool> cmdPool =
6953         createCommandPool(vk, device, vk::VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, queueIndex);
6954     Move<VkCommandBuffer> cmdBuffer = allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY);
6955 
6956     PushConstant pc{};
6957     pc.invocationStride = 0u;
6958     pc.width            = TessCtrlRandomProgram::minSubgroupSize;
6959     pc.height           = patchControlPoints;
6960     pc.primitiveStride  = primitiveStride;
6961 
6962     auto callRecordDrawingAndSubmit = std::bind(&ReconvergenceTestGraphicsInstance::recordDrawingAndSubmit, this,
6963                                                 *cmdBuffer, *pipelineLayout, *pipeline, *descriptorSet, std::cref(pc),
6964                                                 std::cref(renderBeginInfo), **vertexBuffer, vertexCount, **image);
6965 
6966     // compute "maxLoc", which is a potential maximum number of locations written
6967     callRecordDrawingAndSubmit();
6968 
6969     // Take the maximum of "maxLoc" over all invocations.
6970     invalidateAlloc(vk, device, buffers[OutputCounts]->getAllocation());
6971     auto rangeLoc                       = makeStdBeginEnd<const uint32_t>(ptrs[OutputCounts], invocationStride);
6972     const uint32_t computedShaderMaxLoc = (*max_element(rangeLoc.first, rangeLoc.second));
6973     log << tcu::TestLog::Message << "computed shaderMaxLoc: " << computedShaderMaxLoc << tcu::TestLog::EndMessage;
6974 
6975     // If we need more space, reallocate OutputB::b[] aka buffers[1]
6976     if (computedShaderMaxLoc > simulationMaxLoc)
6977     {
6978         // Add one (to make sure no additional writes are done) and multiply by
6979         // the number of invocations and current primitive count
6980         maxLoc               = (computedShaderMaxLoc + 1) * invocationStride;
6981         sizes[OutputBallots] = maxLoc * sizeof(uint64_t);
6982 
6983         if (sizes[OutputBallots] > limits.maxStorageBufferRange)
6984             TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
6985 
6986         try
6987         {
6988             buffers[OutputBallots] = de::MovePtr<BufferWithMemory>(new BufferWithMemory(
6989                 vk, device, allocator,
6990                 makeBufferCreateInfo(sizes[1], VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
6991                                                    VK_BUFFER_USAGE_TRANSFER_SRC_BIT),
6992                 MemoryRequirement::HostVisible | MemoryRequirement::Cached));
6993         }
6994         catch (tcu::ResourceError &)
6995         {
6996             // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
6997             return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
6998                                    "Failed device memory allocation " + de::toString(sizes[OutputBallots]) + " bytes");
6999         }
7000         bufferDescriptors[OutputBallots] = makeDescriptorBufferInfo(**buffers[OutputBallots], 0, sizes[OutputBallots]);
7001         ptrs[OutputBallots]              = buffers[OutputBallots]->getAllocation().getHostPtr();
7002 
7003         vk::DescriptorSetUpdateBuilder setUpdateBuilder2;
7004         setUpdateBuilder2.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(OutputBallots),
7005                                       VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptors[OutputBallots]);
7006         setUpdateBuilder2.update(vk, device);
7007     }
7008 
7009     // Clear any writes to ballots/stores OutputB::b[] aka buffer[1] during the counting pass
7010     // Note that its size would may change since the first memory allocation
7011     deMemset(ptrs[OutputBallots], 0, (size_t)sizes[OutputBallots]);
7012     // Clear any writes to counting OutputC::loc[] aka buffer[2] during the counting pass
7013     deMemset(ptrs[OutputCounts], 0, (size_t)sizes[OutputCounts]);
7014 
7015     // flush them all to the GPU
7016     flushAlloc(vk, device, buffers[OutputBallots]->getAllocation());
7017     flushAlloc(vk, device, buffers[OutputCounts]->getAllocation());
7018 
7019     // run the actual shader with updated PushConstant
7020     pc.invocationStride = invocationStride;
7021     pc.width            = TessCtrlRandomProgram::minSubgroupSize;
7022     pc.height           = patchControlPoints;
7023     pc.primitiveStride  = primitiveStride;
7024     callRecordDrawingAndSubmit();
7025 
7026     invalidateAlloc(vk, device, buffers[OutputCounts]->getAllocation());
7027     const uint32_t finalShaderMaxLoc = (*max_element(rangeLoc.first, rangeLoc.second));
7028     log << tcu::TestLog::Message << "final shaderMaxLoc: " << finalShaderMaxLoc << tcu::TestLog::EndMessage;
7029     if (finalShaderMaxLoc > computedShaderMaxLoc)
7030     {
7031         return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING, "maxLoc differs across shader invocations");
7032     }
7033 
7034     invalidateAlloc(vk, device, buffers[OutputCounts]->getAllocation());
7035     invalidateAlloc(vk, device, buffers[OutputBallots]->getAllocation());
7036 
7037     // Simulate execution on the CPU, and compare against the GPU result
7038     try
7039     {
7040         ref.resize(maxLoc, 0ull);
7041     }
7042     catch (const std::bad_alloc &)
7043     {
7044         // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
7045         return tcu::TestStatus(QP_TEST_RESULT_NOT_SUPPORTED,
7046                                "Failed system memory allocation " + de::toString(maxLoc * sizeof(uint64_t)) + " bytes");
7047     }
7048 
7049     program->simulate(false, m_subgroupSize, ref);
7050 
7051     const uint64_t *ballots = static_cast<uint64_t *>(ptrs[OutputBallots]);
7052     qpTestResult res        = calculateAndLogResult(ballots, ref, invocationStride, m_subgroupSize, finalShaderMaxLoc,
7053                                                     (invocationStride / 3), PrintMode::None);
7054 
7055     return tcu::TestStatus(res, qpGetTestResultName(res));
7056 }
7057 
createShaders(void)7058 std::vector<Move<VkShaderModule>> ReconvergenceTestTessEvalInstance::createShaders(void)
7059 {
7060     const DeviceInterface &vk = m_context.getDeviceInterface();
7061     const VkDevice device     = m_context.getDevice();
7062 
7063     Move<VkShaderModule> vertex     = createShaderModule(vk, device, m_context.getBinaryCollection().get("vert"));
7064     Move<VkShaderModule> fragment   = createShaderModule(vk, device, m_context.getBinaryCollection().get("frag"));
7065     Move<VkShaderModule> control    = createShaderModule(vk, device, m_context.getBinaryCollection().get("tesc"));
7066     Move<VkShaderModule> evaluation = createShaderModule(vk, device, m_context.getBinaryCollection().get("test"));
7067 
7068     // { #vert, #frag, #tesc, #tese, geom }; if any
7069     std::vector<Move<VkShaderModule>> shaders;
7070     shaders.emplace_back(vertex);
7071     shaders.emplace_back(fragment);
7072     shaders.emplace_back(control);
7073     shaders.emplace_back(evaluation);
7074 
7075     return shaders;
7076 }
7077 
iterate(void)7078 tcu::TestStatus ReconvergenceTestTessEvalInstance::iterate(void)
7079 {
7080     const DeviceInterface &vk = m_context.getDeviceInterface();
7081     const VkDevice device     = m_context.getDevice();
7082     Allocator &allocator      = m_context.getDefaultAllocator();
7083     const uint32_t queueIndex = m_context.getUniversalQueueFamilyIndex();
7084     add_ref<tcu::TestLog> log = m_context.getTestContext().getLog();
7085 
7086     if (m_subgroupSize < TessEvalRandomProgram::quadInvocationCount || m_subgroupSize > 64)
7087     {
7088         std::stringstream str;
7089         str << "Subgroup size less than " << TessEvalRandomProgram::quadInvocationCount
7090             << " or greater than 64 not handled.";
7091         str.flush();
7092         TCU_THROW(TestError, str.str());
7093     }
7094 
7095     deRandom rnd;
7096     deRandom_init(&rnd, m_data.seed);
7097 
7098     vk::VkPhysicalDeviceProperties2 properties2;
7099     deMemset(&properties2, 0, sizeof(properties2));
7100     properties2.sType = vk::VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2;
7101     m_context.getInstanceInterface().getPhysicalDeviceProperties2(m_context.getPhysicalDevice(), &properties2);
7102     const VkPhysicalDeviceLimits &limits = properties2.properties.limits;
7103 
7104     const uint32_t patchesPerGroup             = m_subgroupSize / TessEvalRandomProgram::quadInvocationCount;
7105     const uint32_t primitiveStride             = patchesPerGroup * m_data.sizeX;
7106     const uint32_t invocationStride            = primitiveStride * TessEvalRandomProgram::quadInvocationCount;
7107     const std::vector<tcu::Vec4> vertices      = generateVertices(invocationStride, VK_PRIMITIVE_TOPOLOGY_POINT_LIST);
7108     const uint32_t vertexCount                 = uint32_t(vertices.size());
7109     de::MovePtr<BufferWithMemory> vertexBuffer = createVertexBufferAndFlush(vertices);
7110 
7111     DE_ASSERT(invocationStride <= MAX_INVOCATIONS_ALL_TESTS);
7112 
7113     de::MovePtr<TessEvalRandomProgram> program(new TessEvalRandomProgram(m_data, invocationStride));
7114     program->generateRandomProgram(m_context.getTestContext().getWatchDog(), log);
7115 
7116     std::vector<uint64_t> ref;
7117     const uint32_t simulationMaxLoc = program->simulate(true, m_subgroupSize, ref);
7118     log << tcu::TestLog::Message << "simulated maxLoc:       " << simulationMaxLoc << tcu::TestLog::EndMessage;
7119     log << tcu::TestLog::Message << "effective patch size:   " << m_data.sizeY << tcu::TestLog::EndMessage;
7120     log << tcu::TestLog::Message << "effective patch count:  " << primitiveStride << tcu::TestLog::EndMessage;
7121     log << tcu::TestLog::Message << "total invocation count: " << invocationStride << tcu::TestLog::EndMessage;
7122 
7123     // maxLoc is per-invocation. Add one (to make sure no additional writes are done).
7124     uint32_t maxLoc = simulationMaxLoc;
7125     maxLoc += 1;
7126     maxLoc *= invocationStride;
7127 
7128     constexpr uint32_t bufferCount = 3;
7129     enum Bindings
7130     {
7131         InputA,
7132         OutputBallots,
7133         OutputCounts,
7134     };
7135 
7136     de::MovePtr<BufferWithMemory> buffers[bufferCount];
7137     vk::VkDescriptorBufferInfo bufferDescriptors[bufferCount];
7138 
7139     VkDeviceSize sizes[bufferCount]{
7140         // InputA  { uint    a[]; } inputA;  filled with a[i] == i
7141         invocationStride * sizeof(uint32_t),
7142         // OutputB { uvec2   b[]; } outputB;
7143         maxLoc * sizeof(uint64_t),
7144         // OutputC { uint loc[]; } outputC;
7145         invocationStride * sizeof(uint32_t),
7146     };
7147 
7148     VkBufferUsageFlags usages[bufferCount]{
7149         VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
7150         VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
7151         VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
7152     };
7153 
7154     // allocate buffers
7155     for (uint32_t i = 0; i < bufferCount; ++i)
7156     {
7157         if (sizes[i] > limits.maxStorageBufferRange)
7158             TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
7159 
7160         try
7161         {
7162             buffers[i] = de::MovePtr<BufferWithMemory>(
7163                 new BufferWithMemory(vk, device, allocator,
7164                                      makeBufferCreateInfo(sizes[i], usages[i] | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
7165                                                                         VK_BUFFER_USAGE_TRANSFER_SRC_BIT),
7166                                      MemoryRequirement::HostVisible | MemoryRequirement::Cached));
7167         }
7168         catch (tcu::ResourceError &)
7169         {
7170             // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
7171             return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
7172                                    "Failed device memory allocation " + de::toString(sizes[i]) + " bytes");
7173         }
7174         bufferDescriptors[i] = makeDescriptorBufferInfo(**buffers[i], 0, sizes[i]);
7175     }
7176 
7177     // get raw pointers to previously allocated buffers
7178     void *ptrs[bufferCount];
7179     for (uint32_t i = 0; i < bufferCount; ++i)
7180     {
7181         ptrs[i] = (uint32_t *)buffers[i]->getAllocation().getHostPtr();
7182     }
7183 
7184     // populate buffers with their destination
7185     {
7186         auto rangeBufferA = makeStdBeginEnd<uint32_t>(ptrs[InputA], invocationStride);
7187         std::iota(rangeBufferA.first, rangeBufferA.second, 0u);
7188     }
7189     deMemset(ptrs[OutputBallots], 0, (size_t)sizes[OutputBallots]);
7190     deMemset(ptrs[OutputCounts], 0, (size_t)sizes[OutputCounts]);
7191 
7192     // (...) and flush them to the GPU
7193     for (uint32_t i = 0; i < bufferCount; ++i)
7194     {
7195         flushAlloc(vk, device, buffers[i]->getAllocation());
7196     }
7197 
7198     VkDescriptorType descTypes[bufferCount]{
7199         VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
7200         VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
7201         VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
7202     };
7203 
7204     vk::DescriptorSetLayoutBuilder layoutBuilder;
7205     for (uint32_t i = 0; i < bufferCount; ++i)
7206     {
7207         layoutBuilder.addSingleBinding(descTypes[i], m_data.shaderStage);
7208     }
7209     vk::Unique<vk::VkDescriptorSetLayout> descriptorSetLayout(layoutBuilder.build(vk, device));
7210 
7211     vk::DescriptorPoolBuilder poolBuilder;
7212     for (uint32_t i = 0; i < bufferCount; ++i)
7213     {
7214         poolBuilder.addType(descTypes[i], 1);
7215     }
7216     vk::Unique<vk::VkDescriptorPool> descriptorPool(
7217         poolBuilder.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
7218     vk::Unique<vk::VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
7219 
7220     vk::DescriptorSetUpdateBuilder setUpdateBuilder;
7221     for (uint32_t i = 0; i < bufferCount; ++i)
7222     {
7223         setUpdateBuilder.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(i), descTypes[i],
7224                                      &bufferDescriptors[i]);
7225     }
7226     setUpdateBuilder.update(vk, device);
7227 
7228     const VkPushConstantRange pushConstantRange{
7229         (VkShaderStageFlags)m_data.shaderStage, // VkShaderStageFlags stageFlags;
7230         0u,                                     // uint32_t offset;
7231         sizeof(PushConstant)                    // uint32_t size;
7232     };
7233 
7234     // TODO: verify that PushConstant is available on running machine
7235 
7236     const VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo{
7237         VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, // sType
7238         nullptr,                                       // pNext
7239         (VkPipelineLayoutCreateFlags)0,                // flags
7240         1u,                                            // setLayoutCount
7241         &descriptorSetLayout.get(),                    // pSetLayouts
7242         1u,                                            // pushConstantRangeCount
7243         &pushConstantRange,                            // pPushConstantRanges
7244     };
7245 
7246     const uint32_t imageWidth  = 256;
7247     const uint32_t imageHeight = 256;
7248     const VkFormat format      = VK_FORMAT_R8G8B8A8_UNORM;
7249     const VkImageCreateInfo imageCreateInfo{
7250         VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, // VkStructureType sType;
7251         nullptr,                             // const void* pNext;
7252         VkImageCreateFlags(0),               // VkImageCreateFlags flags;
7253         VK_IMAGE_TYPE_2D,                    // VkImageType imageType;
7254         format,                              // VkFormat format;
7255         {imageWidth, imageHeight, 1u},       // VkExtent3D extent;
7256         1u,                                  // uint32_t mipLevels;
7257         1u,                                  // uint32_t arrayLayers;
7258         VK_SAMPLE_COUNT_1_BIT,               // VkSampleCountFlagBits samples;
7259         VK_IMAGE_TILING_OPTIMAL,             // VkImageTiling tiling;
7260         VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT, // VkImageUsageFlags usage;
7261         VK_SHARING_MODE_EXCLUSIVE,           // VkSharingMode sharingMode;
7262         0u,                                  // uint32_t queueFamilyIndexCount;
7263         0u,                                  // const uint32_t* pQueueFamilyIndices;
7264         VK_IMAGE_LAYOUT_UNDEFINED            // VkImageLayout initialLayout;
7265     };
7266     const VkImageSubresourceRange rscRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
7267     de::MovePtr<ImageWithMemory> image(
7268         new ImageWithMemory(vk, device, allocator, imageCreateInfo, vk::MemoryRequirement::Any));
7269     Move<VkImageView> view        = makeImageView(vk, device, **image, VK_IMAGE_VIEW_TYPE_2D, format, rscRange);
7270     Move<VkRenderPass> renderPass = makeRenderPass(vk, device, format);
7271     Move<VkFramebuffer> framebuffer =
7272         makeFramebuffer(vk, device, *renderPass, *view, m_data.sizeX, m_data.sizeY, rscRange.layerCount);
7273     const VkRenderPassBeginInfo renderBeginInfo = makeRenderPassBeginInfo(*renderPass, *framebuffer);
7274     const Shaders shaders                       = createShaders();
7275     Move<VkPipelineLayout> pipelineLayout       = createPipelineLayout(vk, device, &pipelineLayoutCreateInfo, NULL);
7276     Move<VkPipeline> pipeline =
7277         createGraphicsPipeline(*pipelineLayout, *renderPass, imageWidth, imageHeight, shaders,
7278                                VK_PRIMITIVE_TOPOLOGY_PATCH_LIST, TessEvalRandomProgram::quadInvocationCount);
7279     Move<VkCommandPool> cmdPool =
7280         createCommandPool(vk, device, vk::VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, queueIndex);
7281     Move<VkCommandBuffer> cmdBuffer = allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY);
7282 
7283     PushConstant pc{};
7284     pc.invocationStride = 0u;
7285     pc.width            = TessEvalRandomProgram::quadInvocationCount;
7286 
7287     auto callRecordDrawingAndSubmit = std::bind(&ReconvergenceTestGraphicsInstance::recordDrawingAndSubmit, this,
7288                                                 *cmdBuffer, *pipelineLayout, *pipeline, *descriptorSet, std::cref(pc),
7289                                                 std::cref(renderBeginInfo), **vertexBuffer, vertexCount, **image);
7290 
7291     // compute "maxLoc", which is a potential maximum number of locations written
7292     callRecordDrawingAndSubmit();
7293 
7294     // Take the maximum of "maxLoc" over all invocations.
7295     invalidateAlloc(vk, device, buffers[OutputCounts]->getAllocation());
7296     auto rangeLoc                       = makeStdBeginEnd<const uint32_t>(ptrs[OutputCounts], invocationStride);
7297     const uint32_t computedShaderMaxLoc = (*max_element(rangeLoc.first, rangeLoc.second));
7298     log << tcu::TestLog::Message << "computed shaderMaxLoc: " << computedShaderMaxLoc << tcu::TestLog::EndMessage;
7299 
7300     // If we need more space, reallocate OutputB::b[] aka buffers[1]
7301     if (computedShaderMaxLoc > simulationMaxLoc)
7302     {
7303         // Add one (to make sure no additional writes are done) and multiply by
7304         // the number of invocations and current primitive count
7305         maxLoc               = (computedShaderMaxLoc + 1) * invocationStride;
7306         sizes[OutputBallots] = maxLoc * sizeof(uint64_t);
7307 
7308         if (sizes[OutputBallots] > limits.maxStorageBufferRange)
7309             TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
7310 
7311         try
7312         {
7313             buffers[OutputBallots] = de::MovePtr<BufferWithMemory>(new BufferWithMemory(
7314                 vk, device, allocator,
7315                 makeBufferCreateInfo(sizes[1], VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
7316                                                    VK_BUFFER_USAGE_TRANSFER_SRC_BIT),
7317                 MemoryRequirement::HostVisible | MemoryRequirement::Cached));
7318         }
7319         catch (tcu::ResourceError &)
7320         {
7321             // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
7322             return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
7323                                    "Failed device memory allocation " + de::toString(sizes[OutputBallots]) + " bytes");
7324         }
7325         bufferDescriptors[OutputBallots] = makeDescriptorBufferInfo(**buffers[OutputBallots], 0, sizes[OutputBallots]);
7326         ptrs[OutputBallots]              = buffers[OutputBallots]->getAllocation().getHostPtr();
7327 
7328         vk::DescriptorSetUpdateBuilder setUpdateBuilder2;
7329         setUpdateBuilder2.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(OutputBallots),
7330                                       VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptors[OutputBallots]);
7331         setUpdateBuilder2.update(vk, device);
7332     }
7333 
7334     // Clear any writes to ballots/stores OutputB::b[] aka buffer[1] during the counting pass
7335     // Note that its size would may change since the first memory allocation
7336     deMemset(ptrs[OutputBallots], 0, (size_t)sizes[OutputBallots]);
7337     // Clear any writes to counting OutputC::loc[] aka buffer[2] during the counting pass
7338     deMemset(ptrs[OutputCounts], 0, (size_t)sizes[OutputCounts]);
7339 
7340     // flush them all to the GPU
7341     flushAlloc(vk, device, buffers[OutputBallots]->getAllocation());
7342     flushAlloc(vk, device, buffers[OutputCounts]->getAllocation());
7343 
7344     // run the actual shader with updated PushConstant
7345     pc.invocationStride = invocationStride;
7346     pc.width            = TessEvalRandomProgram::quadInvocationCount;
7347     callRecordDrawingAndSubmit();
7348 
7349     invalidateAlloc(vk, device, buffers[OutputCounts]->getAllocation());
7350     const uint32_t finalShaderMaxLoc = (*max_element(rangeLoc.first, rangeLoc.second));
7351     log << tcu::TestLog::Message << "final shaderMaxLoc: " << finalShaderMaxLoc << tcu::TestLog::EndMessage;
7352     if (finalShaderMaxLoc > computedShaderMaxLoc)
7353     {
7354         std::stringstream s;
7355         s << "maxLoc differs across shader invocations: " << finalShaderMaxLoc << " and " << computedShaderMaxLoc;
7356         return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING, s.str());
7357     }
7358 
7359     invalidateAlloc(vk, device, buffers[OutputCounts]->getAllocation());
7360     invalidateAlloc(vk, device, buffers[OutputBallots]->getAllocation());
7361 
7362     // Simulate execution on the CPU, and compare against the GPU result
7363     try
7364     {
7365         ref.resize(maxLoc, 0ull);
7366     }
7367     catch (const std::bad_alloc &)
7368     {
7369         // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
7370         return tcu::TestStatus(QP_TEST_RESULT_NOT_SUPPORTED,
7371                                "Failed system memory allocation " + de::toString(maxLoc * sizeof(uint64_t)) + " bytes");
7372     }
7373 
7374     program->simulate(false, m_subgroupSize, ref);
7375 
7376     const uint64_t *ballots = static_cast<uint64_t *>(ptrs[OutputBallots]);
7377     qpTestResult res        = calculateAndLogResult(ballots, ref, invocationStride, m_subgroupSize, finalShaderMaxLoc,
7378                                                     (invocationStride / 3), PrintMode::None);
7379 
7380     return tcu::TestStatus(res, qpGetTestResultName(res));
7381 }
7382 
createVertexBufferAndFlush(uint32_t cellsHorz,uint32_t cellsVert,VkPrimitiveTopology topology)7383 de::MovePtr<BufferWithMemory> ReconvergenceTestGeometryInstance::createVertexBufferAndFlush(
7384     uint32_t cellsHorz, uint32_t cellsVert, VkPrimitiveTopology topology)
7385 {
7386     DE_UNREF(topology);
7387     DE_ASSERT(VK_PRIMITIVE_TOPOLOGY_POINT_LIST == topology);
7388     const std::vector<tcu::Vec4> vertices = GeometryRandomProgram::Arrangement::generatePrimitives(
7389         cellsHorz, cellsVert, GeometryRandomProgram::fillPercentage);
7390     return ReconvergenceTestGraphicsInstance::createVertexBufferAndFlush(vertices);
7391 }
7392 
createShaders(void)7393 std::vector<Move<VkShaderModule>> ReconvergenceTestGeometryInstance::createShaders(void)
7394 {
7395     const DeviceInterface &vk = m_context.getDeviceInterface();
7396     const VkDevice device     = m_context.getDevice();
7397 
7398     Move<VkShaderModule> vertex   = createShaderModule(vk, device, m_context.getBinaryCollection().get("vert"));
7399     Move<VkShaderModule> fragment = createShaderModule(vk, device, m_context.getBinaryCollection().get("frag"));
7400     Move<VkShaderModule> geometry = createShaderModule(vk, device, m_context.getBinaryCollection().get("test"));
7401 
7402     // { #vert, #frag, tesc, tese, #geom }; if any
7403     std::vector<Move<VkShaderModule>> shaders;
7404     shaders.emplace_back(vertex);
7405     shaders.emplace_back(fragment);
7406     shaders.emplace_back();
7407     shaders.emplace_back();
7408     shaders.emplace_back(geometry);
7409 
7410     return shaders;
7411 }
7412 
iterate(void)7413 tcu::TestStatus ReconvergenceTestGeometryInstance::iterate(void)
7414 {
7415     const VkPhysicalDeviceLimits &limits = m_context.getDeviceProperties().limits;
7416     if (sizeof(PushConstant) > limits.maxPushConstantsSize)
7417     {
7418         return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
7419                                "PushConstant size " + std::to_string(sizeof(PushConstant)) + " exceeds device limit " +
7420                                    std::to_string(limits.maxPushConstantsSize));
7421     }
7422 
7423     const DeviceInterface &vk          = m_context.getDeviceInterface();
7424     const VkDevice device              = m_context.getDevice();
7425     Allocator &allocator               = m_context.getDefaultAllocator();
7426     const uint32_t queueIndex          = m_context.getUniversalQueueFamilyIndex();
7427     add_ref<tcu::TestLog> log          = m_context.getTestContext().getLog();
7428     const VkPrimitiveTopology topology = VK_PRIMITIVE_TOPOLOGY_POINT_LIST;
7429     const uint32_t fragmentStride      = uint32_t(m_data.sizeX * m_data.sizeY);
7430     const uint32_t invocationStride    = GeometryRandomProgram::Arrangement::calculatePrimitiveCount(
7431         m_data.sizeX, m_data.sizeY, GeometryRandomProgram::fillPercentage);
7432 
7433     de::MovePtr<GeometryRandomProgram> program(new GeometryRandomProgram(m_data));
7434     program->generateRandomProgram(m_context.getTestContext().getWatchDog(), log);
7435 
7436     // simulate content of outputP buffer
7437     std::vector<uint32_t> outputP =
7438         GeometryRandomProgram::Arrangement::generateVectorOutputP(m_subgroupSize, invocationStride);
7439 
7440     std::vector<tcu::UVec4> ref;
7441     const uint32_t hostMaxLoc = program->execute(m_context.getTestContext().getWatchDog(), true, m_subgroupSize,
7442                                                  fragmentStride, invocationStride, ref, log, outputP, nullptr);
7443     log << tcu::TestLog::Message << "Rendering area  : " << tcu::UVec2(m_data.sizeX, m_data.sizeY)
7444         << tcu::TestLog::EndMessage;
7445     log << tcu::TestLog::Message << "invocationStride: " << invocationStride << tcu::TestLog::EndMessage;
7446     log << tcu::TestLog::Message << "Simulated maxLoc: " << hostMaxLoc << tcu::TestLog::EndMessage;
7447     // maxLoc is per-invocation. Add one (to make sure no additional writes are done).
7448     uint32_t maxLoc = hostMaxLoc;
7449     maxLoc += 1;
7450     maxLoc *= invocationStride;
7451 
7452     constexpr uint32_t bufferCount = 4u;
7453     enum Bindings
7454     {
7455         InputA,
7456         OutputBallots,
7457         OutputCounts,
7458         OutputPrimitives
7459     };
7460 
7461     de::MovePtr<BufferWithMemory> buffers[bufferCount];
7462     vk::VkDescriptorBufferInfo bufferDescriptors[bufferCount];
7463 
7464     uint32_t counts[bufferCount]{// InputA  { uint    a[]; } inputA;
7465                                  uint32_t(m_data.sizeX * m_data.sizeY),
7466                                  // OutputB { uvec2   b[]; } outputB;
7467                                  maxLoc,
7468                                  // OutputC { uint loc[]; } outputC;
7469                                  invocationStride,
7470                                  // OutputP { uint p[]; } outputP;
7471                                  uint32_t(outputP.size())};
7472 
7473     VkDeviceSize sizes[bufferCount]{// InputA  { uint    a[]; } inputA;
7474                                     counts[InputA] * sizeof(uint32_t),
7475                                     // OutputB { uvec2   b[]; } outputB;
7476                                     counts[OutputBallots] * sizeof(tcu::UVec4),
7477                                     // OutputC { uint loc[]; } outputC;
7478                                     counts[OutputCounts] * sizeof(uint32_t),
7479                                     // OutputP { uint p[]; } outputP;
7480                                     counts[OutputPrimitives] * sizeof(uint32_t)};
7481 
7482     const VkBufferUsageFlags cmnUsages = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
7483     VkBufferUsageFlags usages[bufferCount]{
7484         VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
7485         VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
7486         VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
7487         VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
7488     };
7489 
7490     // allocate buffers
7491     for (uint32_t i = 0; i < bufferCount; ++i)
7492     {
7493         if (sizes[i] > limits.maxStorageBufferRange)
7494             TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
7495         try
7496         {
7497             buffers[i] = de::MovePtr<BufferWithMemory>(
7498                 new BufferWithMemory(vk, device, allocator, makeBufferCreateInfo(sizes[i], usages[i] | cmnUsages),
7499                                      MemoryRequirement::HostVisible | MemoryRequirement::Cached));
7500         }
7501         catch (tcu::ResourceError &)
7502         {
7503             // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
7504             return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
7505                                    "Failed device memory allocation " + de::toString(sizes[i]) + " bytes");
7506         }
7507         bufferDescriptors[i] = makeDescriptorBufferInfo(**buffers[i], 0, sizes[i]);
7508     }
7509 
7510     // get raw pointers to previously allocated buffers
7511     void *ptrs[bufferCount];
7512     for (uint32_t i = 0; i < bufferCount; ++i)
7513     {
7514         ptrs[i] = (uint32_t *)buffers[i]->getAllocation().getHostPtr();
7515     }
7516 
7517     // populate buffers with their destination
7518     {
7519         auto rangeBufferA = makeStdBeginEnd<uint32_t>(ptrs[InputA], counts[InputA]);
7520         std::iota(rangeBufferA.first, rangeBufferA.second, 0u);
7521     }
7522     deMemset(ptrs[OutputBallots], 0, (size_t)sizes[OutputBallots]);
7523     deMemset(ptrs[OutputCounts], 0, (size_t)sizes[OutputCounts]);
7524     deMemset(ptrs[OutputPrimitives], 0, (size_t)sizes[OutputPrimitives]);
7525 
7526     // (...) and flush them to the GPU
7527     for (uint32_t i = 0; i < bufferCount; ++i)
7528     {
7529         flushAlloc(vk, device, buffers[i]->getAllocation());
7530     }
7531 
7532     VkDescriptorType descTypes[bufferCount]{
7533         VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
7534         VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
7535         VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
7536         VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
7537     };
7538 
7539     vk::DescriptorSetLayoutBuilder layoutBuilder;
7540     for (uint32_t i = 0; i < bufferCount; ++i)
7541     {
7542         layoutBuilder.addSingleBinding(descTypes[i], m_data.shaderStage);
7543     }
7544     vk::Unique<vk::VkDescriptorSetLayout> descriptorSetLayout(layoutBuilder.build(vk, device));
7545 
7546     vk::DescriptorPoolBuilder poolBuilder;
7547     for (uint32_t i = 0; i < bufferCount; ++i)
7548     {
7549         poolBuilder.addType(descTypes[i], 1);
7550     }
7551     vk::Unique<vk::VkDescriptorPool> descriptorPool(
7552         poolBuilder.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
7553     vk::Unique<vk::VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
7554 
7555     vk::DescriptorSetUpdateBuilder setUpdateBuilder;
7556     for (uint32_t i = 0; i < bufferCount; ++i)
7557     {
7558         setUpdateBuilder.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(i), descTypes[i],
7559                                      &bufferDescriptors[i]);
7560     }
7561     setUpdateBuilder.update(vk, device);
7562 
7563     const VkPushConstantRange pushConstantRange{
7564         (VkShaderStageFlags)m_data.shaderStage, // VkShaderStageFlags stageFlags;
7565         0u,                                     // uint32_t offset;
7566         sizeof(PushConstant)                    // uint32_t size;
7567     };
7568 
7569     const VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo{
7570         VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, // sType
7571         nullptr,                                       // pNext
7572         (VkPipelineLayoutCreateFlags)0,                // flags
7573         1u,                                            // setLayoutCount
7574         &descriptorSetLayout.get(),                    // pSetLayouts
7575         1u,                                            // pushConstantRangeCount
7576         &pushConstantRange,                            // pPushConstantRanges
7577     };
7578 
7579     const uint32_t imageWidth  = m_data.sizeX;
7580     const uint32_t imageHeight = m_data.sizeY;
7581     const VkFormat format      = VK_FORMAT_R8G8B8A8_UNORM;
7582     const VkImageCreateInfo imageCreateInfo{
7583         VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, // VkStructureType sType;
7584         nullptr,                             // const void* pNext;
7585         VkImageCreateFlags(0),               // VkImageCreateFlags flags;
7586         VK_IMAGE_TYPE_2D,                    // VkImageType imageType;
7587         format,                              // VkFormat format;
7588         {imageWidth, imageHeight, 1u},       // VkExtent3D extent;
7589         1u,                                  // uint32_t mipLevels;
7590         1u,                                  // uint32_t arrayLayers;
7591         VK_SAMPLE_COUNT_1_BIT,               // VkSampleCountFlagBits samples;
7592         VK_IMAGE_TILING_OPTIMAL,             // VkImageTiling tiling;
7593         VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT, // VkImageUsageFlags usage;
7594         VK_SHARING_MODE_EXCLUSIVE,           // VkSharingMode sharingMode;
7595         0u,                                  // uint32_t queueFamilyIndexCount;
7596         0u,                                  // const uint32_t* pQueueFamilyIndices;
7597         VK_IMAGE_LAYOUT_UNDEFINED            // VkImageLayout initialLayout;
7598     };
7599     const VkImageSubresourceRange rscRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
7600     de::MovePtr<ImageWithMemory> image(
7601         new ImageWithMemory(vk, device, allocator, imageCreateInfo, vk::MemoryRequirement::Any));
7602     Move<VkImageView> view        = makeImageView(vk, device, **image, VK_IMAGE_VIEW_TYPE_2D, format, rscRange);
7603     Move<VkRenderPass> renderPass = makeRenderPass(vk, device, format);
7604     Move<VkFramebuffer> framebuffer =
7605         makeFramebuffer(vk, device, *renderPass, *view, m_data.sizeX, m_data.sizeY, rscRange.layerCount);
7606     de::MovePtr<BufferWithMemory> vertexBuffer  = createVertexBufferAndFlush(m_data.sizeX, m_data.sizeY, topology);
7607     const VkRenderPassBeginInfo renderBeginInfo = makeRenderPassBeginInfo(*renderPass, *framebuffer);
7608     const Shaders shaders                       = createShaders();
7609     Move<VkPipelineLayout> pipelineLayout       = createPipelineLayout(vk, device, &pipelineLayoutCreateInfo, NULL);
7610     Move<VkPipeline> pipeline = createGraphicsPipeline(*pipelineLayout, *renderPass, imageWidth, imageHeight, shaders,
7611                                                        VK_PRIMITIVE_TOPOLOGY_POINT_LIST);
7612     Move<VkCommandPool> cmdPool =
7613         createCommandPool(vk, device, vk::VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, queueIndex);
7614     Move<VkCommandBuffer> cmdBuffer = allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY);
7615 
7616     PushConstant pc{};
7617     pc.invocationStride      = invocationStride;
7618     pc.width                 = m_data.sizeX;
7619     pc.height                = m_data.sizeY;
7620     pc.enableInvocationIndex = VK_FALSE;
7621 
7622     auto callRecordDrawingAndSubmit = std::bind(&ReconvergenceTestGraphicsInstance::recordDrawingAndSubmit, this,
7623                                                 *cmdBuffer, *pipelineLayout, *pipeline, *descriptorSet, std::cref(pc),
7624                                                 std::cref(renderBeginInfo), **vertexBuffer, invocationStride, **image);
7625 
7626     // compute "maxLoc", which is a potential maximum number of locations written
7627     callRecordDrawingAndSubmit();
7628 
7629     // Take the maximum of "maxLoc" over all invocations.
7630     invalidateAlloc(vk, device, buffers[OutputCounts]->getAllocation());
7631     auto rangeLoc               = makeStdBeginEnd<const uint32_t>(ptrs[OutputCounts], invocationStride);
7632     const uint32_t shaderMaxLoc = (*max_element(rangeLoc.first, rangeLoc.second));
7633     log << tcu::TestLog::Message << "computed maxLoc in shader: " << shaderMaxLoc << tcu::TestLog::EndMessage;
7634 
7635     // If we need more space, reallocate OutputB::b[] aka buffers[1]
7636     if (shaderMaxLoc > hostMaxLoc)
7637     {
7638         // Add one (to make sure no additional writes are done) and multiply by
7639         // the number of invocations and current primitive count
7640         maxLoc                = (std::max(shaderMaxLoc, hostMaxLoc) + 1u) * invocationStride;
7641         counts[OutputBallots] = maxLoc;
7642         sizes[OutputBallots]  = counts[OutputBallots] * sizeof(tcu::UVec4);
7643 
7644         if (sizes[OutputBallots] > limits.maxStorageBufferRange)
7645             TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
7646 
7647         try
7648         {
7649             buffers[OutputBallots] = de::MovePtr<BufferWithMemory>(new BufferWithMemory(
7650                 vk, device, allocator, makeBufferCreateInfo(sizes[1], VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | cmnUsages),
7651                 MemoryRequirement::HostVisible | MemoryRequirement::Cached));
7652         }
7653         catch (tcu::ResourceError &)
7654         {
7655             // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
7656             return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
7657                                    "Failed device memory allocation " + de::toString(sizes[OutputBallots]) + " bytes");
7658         }
7659         bufferDescriptors[OutputBallots] = makeDescriptorBufferInfo(**buffers[OutputBallots], 0, sizes[OutputBallots]);
7660         ptrs[OutputBallots]              = buffers[OutputBallots]->getAllocation().getHostPtr();
7661 
7662         vk::DescriptorSetUpdateBuilder setUpdateBuilder2;
7663         setUpdateBuilder2.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(OutputBallots),
7664                                       VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptors[OutputBallots]);
7665         setUpdateBuilder2.update(vk, device);
7666     }
7667 
7668     // Clear any writes to ballots/stores OutputB::b[] aka buffer[1] during the counting pass
7669     // Note that its size would may change since the first memory allocation
7670     deMemset(ptrs[OutputBallots], 0, (size_t)sizes[OutputBallots]);
7671     deMemset(ptrs[OutputCounts], 0, (size_t)sizes[OutputCounts]);
7672     deMemset(ptrs[OutputPrimitives], 0, (size_t)sizes[OutputPrimitives]);
7673 
7674     // flush them all to the GPU
7675     flushAlloc(vk, device, buffers[OutputBallots]->getAllocation());
7676     flushAlloc(vk, device, buffers[OutputCounts]->getAllocation());
7677     flushAlloc(vk, device, buffers[OutputPrimitives]->getAllocation());
7678 
7679     // run the actual shader with updated PushConstant
7680     pc.enableInvocationIndex = VK_TRUE;
7681     callRecordDrawingAndSubmit();
7682 
7683     invalidateAlloc(vk, device, buffers[OutputCounts]->getAllocation());
7684     const uint32_t finalShaderMaxLoc = (*max_element(rangeLoc.first, rangeLoc.second));
7685     log << tcu::TestLog::Message << "final shaderMaxLoc: " << finalShaderMaxLoc << tcu::TestLog::EndMessage;
7686     if (finalShaderMaxLoc != shaderMaxLoc)
7687     {
7688         return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
7689                                "maxLoc differs across shader invocations, expected: " + de::toString(shaderMaxLoc) +
7690                                    " got: " + de::toString(finalShaderMaxLoc));
7691     }
7692 
7693     invalidateAlloc(vk, device, buffers[OutputBallots]->getAllocation());
7694     const tcu::UVec4 *ballots = static_cast<tcu::UVec4 *>(ptrs[OutputBallots]);
7695 
7696     invalidateAlloc(vk, device, buffers[OutputPrimitives]->getAllocation());
7697     auto outputPrange = makeStdBeginEnd<uint32_t>(ptrs[OutputPrimitives], counts[OutputPrimitives]);
7698     std::copy(outputPrange.first, outputPrange.second, outputP.begin());
7699 
7700     try
7701     {
7702         ref.resize(counts[OutputBallots], tcu::UVec4(0u, 0u, 0u, 0u));
7703     }
7704     catch (const std::bad_alloc &)
7705     {
7706         // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
7707         return tcu::TestStatus(QP_TEST_RESULT_NOT_SUPPORTED,
7708                                "Failed system memory allocation " + de::toString(maxLoc * sizeof(uint64_t)) + " bytes");
7709     }
7710 
7711     // Simulate execution on the CPU, and compare against the GPU result
7712     const uint32_t finalHostMaxLoc = program->execute(m_context.getTestContext().getWatchDog(), false, m_subgroupSize,
7713                                                       fragmentStride, invocationStride, ref, log, outputP, ballots);
7714 
7715     const qpTestResult res = calculateAndLogResultEx(log, ballots, ref, finalHostMaxLoc, PrintMode::None);
7716 
7717     return tcu::TestStatus(res, qpGetTestResultName(res));
7718 }
7719 
calculateAndLogResultEx(add_ref<tcu::TestLog> log,const tcu::UVec4 * result,const std::vector<tcu::UVec4> & ref,const uint32_t maxLoc,const PrintMode printMode)7720 qpTestResult_e ReconvergenceTestGeometryInstance::calculateAndLogResultEx(add_ref<tcu::TestLog> log,
7721                                                                           const tcu::UVec4 *result,
7722                                                                           const std::vector<tcu::UVec4> &ref,
7723                                                                           const uint32_t maxLoc,
7724                                                                           const PrintMode printMode)
7725 {
7726     DE_UNREF(maxLoc);
7727     DE_UNREF(printMode);
7728 
7729     qpTestResult res                  = QP_TEST_RESULT_PASS;
7730     uint32_t mismatchCount            = 0u;
7731     const uint32_t printMismatchCount = 5u;
7732 
7733     // With maximal reconvergence, we should expect the output to exactly match the reference.
7734     const uint32_t ballotStoreCount = static_cast<uint32_t>(ref.size());
7735     for (uint32_t i = 0; i < ballotStoreCount; ++i)
7736     {
7737         const Ballot resultVal(result[i], m_subgroupSize);
7738         const Ballot refVal(ref.at(i), m_subgroupSize);
7739         if (resultVal != refVal)
7740         {
7741             if (mismatchCount++ < printMismatchCount)
7742             {
7743                 res = QP_TEST_RESULT_FAIL;
7744                 log << tcu::TestLog::Message << "Mismatch at " << i << "\nexpected: " << resultVal
7745                     << "\n     got: " << refVal << tcu::TestLog::EndMessage;
7746                 if (printMode == PrintMode::Console)
7747                 {
7748                     std::cout << "Mismatch at " << i << "\nexpected: " << resultVal << "\n     got: " << refVal
7749                               << std::endl;
7750                 }
7751             }
7752         }
7753     }
7754 
7755     log << tcu::TestLog::Message << "Mismatch count: " << mismatchCount << " from " << ballotStoreCount
7756         << tcu::TestLog::EndMessage;
7757     if (printMode == PrintMode::Console)
7758     {
7759         std::cout << "Mismatch count: " << mismatchCount << " from " << ballotStoreCount << std::endl;
7760     }
7761 
7762     return res;
7763 }
7764 
7765 void createAmberFragmentTestCases(add_ref<tcu::TestContext> testCtx, add_ptr<tcu::TestCaseGroup> group);
7766 
createTests(tcu::TestContext & testCtx,const std::string & name,bool createExperimental)7767 tcu::TestCaseGroup *createTests(tcu::TestContext &testCtx, const std::string &name, bool createExperimental)
7768 {
7769     de::MovePtr<tcu::TestCaseGroup> group(new tcu::TestCaseGroup(testCtx, name.c_str(), "reconvergence tests"));
7770 
7771     typedef struct
7772     {
7773         uint32_t value;
7774         const char *name;
7775         const char *description;
7776     } TestGroupCase;
7777 
7778     TestGroupCase ttCases[] = {
7779         {TT_SUCF_ELECT, "subgroup_uniform_control_flow_elect", "subgroup_uniform_control_flow_elect"},
7780         {TT_SUCF_BALLOT, "subgroup_uniform_control_flow_ballot", "subgroup_uniform_control_flow_ballot"},
7781         {TT_WUCF_ELECT, "workgroup_uniform_control_flow_elect", "workgroup_uniform_control_flow_elect"},
7782         {TT_WUCF_BALLOT, "workgroup_uniform_control_flow_ballot", "workgroup_uniform_control_flow_ballot"},
7783         {TT_MAXIMAL, "maximal", "maximal"},
7784     };
7785 
7786     std::pair<VkShaderStageFlagBits, const char *> const stTypes[]{
7787         {VK_SHADER_STAGE_COMPUTE_BIT, "compute"},
7788         {VK_SHADER_STAGE_FRAGMENT_BIT, "fragment"},
7789 #ifdef INCLUDE_GRAPHICS_TESTS
7790         {VK_SHADER_STAGE_VERTEX_BIT, "vertex"},
7791         {VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT, "tessctrl"},
7792         {VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT, "tesseval"},
7793         {VK_SHADER_STAGE_GEOMETRY_BIT, "geometry"},
7794 #endif
7795     };
7796 
7797     for (int ttNdx = 0; ttNdx < DE_LENGTH_OF_ARRAY(ttCases); ttNdx++)
7798     {
7799         de::MovePtr<tcu::TestCaseGroup> ttGroup(
7800             new tcu::TestCaseGroup(testCtx, ttCases[ttNdx].name, ttCases[ttNdx].description));
7801 
7802         for (int stNdx = 0; stNdx < DE_LENGTH_OF_ARRAY(stTypes); ++stNdx)
7803         {
7804             // Only 'maximal' tests can process this loop when we are dealing with various kind of shaders,
7805             if (stTypes[stNdx].first != VK_SHADER_STAGE_COMPUTE_BIT && ttCases[ttNdx].value != TT_MAXIMAL)
7806                 continue;
7807 
7808             de::MovePtr<tcu::TestCaseGroup> shaderGroup(new tcu::TestCaseGroup(testCtx, stTypes[stNdx].second, ""));
7809 
7810             uint32_t nNdx = 2;
7811 
7812             if (stTypes[stNdx].first == VK_SHADER_STAGE_FRAGMENT_BIT)
7813             {
7814                 nNdx = 7;
7815                 createAmberFragmentTestCases(testCtx, shaderGroup.get());
7816             }
7817 
7818             for (/*uint32_t nNdx = 2*/; nNdx <= 6; nNdx++)
7819             {
7820                 de::MovePtr<tcu::TestCaseGroup> nestGroup(
7821                     new tcu::TestCaseGroup(testCtx, ("nesting" + de::toString(nNdx)).c_str(), ""));
7822 
7823                 uint32_t seed = 0;
7824 
7825                 for (int sNdx = 0; sNdx < 8; sNdx++)
7826                 {
7827                     de::MovePtr<tcu::TestCaseGroup> seedGroup(
7828                         new tcu::TestCaseGroup(testCtx, de::toString(sNdx).c_str(), ""));
7829 
7830                     uint32_t numTests = 0;
7831                     switch (nNdx)
7832                     {
7833                     default:
7834                         DE_ASSERT(0);
7835                         // fallthrough
7836                     case 2:
7837                     case 3:
7838                     case 4:
7839                         numTests = 250;
7840                         break;
7841                     case 5:
7842                         numTests = 100;
7843                         break;
7844                     case 6:
7845                         numTests = 50;
7846                         break;
7847                     }
7848 
7849                     if (ttCases[ttNdx].value != TT_MAXIMAL)
7850                     {
7851                         if (nNdx >= 5)
7852                             continue;
7853                     }
7854 
7855                     for (uint32_t ndx = 0; ndx < numTests; ndx++)
7856                     {
7857                         uint32_t dim = 0u;
7858                         DE_UNREF(dim);
7859                         uint32_t sizeX = 0u;
7860                         uint32_t sizeY = 0u;
7861                         switch (stTypes[stNdx].first)
7862                         {
7863                         case VK_SHADER_STAGE_COMPUTE_BIT:
7864                             // we want to test at least full subgroup
7865                             // both are primary numbers
7866                             sizeX = 7u;
7867                             sizeY = 13u;
7868                             break;
7869                         case VK_SHADER_STAGE_FRAGMENT_BIT:
7870                             sizeX = 32;
7871                             sizeY = 32;
7872                             break;
7873                         case VK_SHADER_STAGE_VERTEX_BIT:
7874                             // we want to test at least full subgroup
7875                             dim   = uint32_t(std::ceil(
7876                                 std::sqrt((double)(((128u + 31u) * 100u) / VertexRandomProgram::fillPercentage))));
7877                             sizeX = dim;
7878                             sizeY = dim;
7879                             break;
7880                         case VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT:
7881                             sizeX = 19; // positive number of desired subgroups
7882                             sizeY = 1;  // used only for framebuffer extent in TCS test
7883                             break;
7884                         case VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT:
7885                             sizeX = 23; // positive number of desired subgroups
7886                             sizeY = 1;  // used only for framebuffer extent in TES test
7887                             break;
7888                         case VK_SHADER_STAGE_GEOMETRY_BIT:
7889                             // we want to test at least full subgroup
7890                             dim   = uint32_t(std::ceil(
7891                                 std::sqrt((double)(((128u + 29u) * 100u) / GeometryRandomProgram::fillPercentage))));
7892                             sizeX = dim;
7893                             sizeY = dim;
7894                             break;
7895                         default:
7896                             DE_ASSERT(0);
7897                         }
7898                         CaseDef c = {
7899                             stTypes[stNdx].first,           // VkShaderStageFlagBits    shaderStage
7900                             (TestType)ttCases[ttNdx].value, // TestType testType;
7901                             nNdx,                           // uint32_t maxNesting;
7902                             seed,                           // uint32_t seed;
7903                             sizeX,                          // uint32_t sizeX;
7904                             sizeY                           // uint32_t sizeY;
7905                         };
7906                         // product of sizeX and sizeY must not exceed MAX_INVOCATIONS_ALL_TESTS
7907                         DE_ASSERT(c.verify());
7908                         seed++;
7909 
7910                         bool isExperimentalTest = (ndx >= numTests / 5);
7911 
7912                         if (createExperimental == isExperimentalTest)
7913                             seedGroup->addChild(new ReconvergenceTestCase(testCtx, de::toString(ndx).c_str(), c));
7914                     }
7915                     if (!seedGroup->empty())
7916                         nestGroup->addChild(seedGroup.release());
7917                 }
7918                 if (!nestGroup->empty())
7919                     shaderGroup->addChild(nestGroup.release());
7920             }
7921             if (!shaderGroup->empty())
7922                 ttGroup->addChild(shaderGroup.release());
7923         }
7924         group->addChild(ttGroup.release());
7925     }
7926 
7927     return group.release();
7928 }
7929 
createAmberFragmentTestCases(add_ref<tcu::TestContext> testCtx,add_ptr<tcu::TestCaseGroup> group)7930 void createAmberFragmentTestCases(add_ref<tcu::TestContext> testCtx, add_ptr<tcu::TestCaseGroup> group)
7931 {
7932     using namespace cts_amber;
7933 
7934     enum Tests
7935     {
7936         TERMINATE_INVOCATION,
7937         DEMOTE_INVOCATION,
7938         DEMOTE_ENTIRE_QUAD,
7939         DEMOTE_HALF_QUAD_TOP,
7940         DEMOTE_HALF_QUAD_RIGHT,
7941         DEMOTE_HALF_QUAD_BOTTOM,
7942         DEMOTE_HALF_QUAD_LEFT,
7943         DEMOTE_HALF_QUAD_SLASH,
7944         DEMOTE_HALF_QUAD_BACKSLASH
7945     };
7946 
7947     struct Case
7948     {
7949         Tests test;
7950         add_cptr<char> name;
7951         add_cptr<char> desc;
7952         std::size_t hname;
7953         Case(Tests aTest, add_cptr<char> aName, add_cptr<char> aDesc)
7954             : test(aTest)
7955             , name(aName)
7956             , desc(aDesc)
7957             , hname(std::hash<std::string>()(std::string(aName)))
7958         {
7959         }
7960         bool matches(add_cref<std::string> aName) const
7961         {
7962             return hname == std::hash<std::string>()(aName);
7963         }
7964         static bool matches(add_cref<std::string> aName, std::initializer_list<Case> aList)
7965         {
7966             for (auto i = aList.begin(); i != aList.end(); ++i)
7967             {
7968                 if (i->matches(aName))
7969                     return true;
7970             }
7971             return false;
7972         }
7973         std::string makeFileName() const
7974         {
7975             return (std::string(name) + ".amber");
7976         }
7977     } static const cases[]{
7978         Case(TERMINATE_INVOCATION, "terminate_invocation",
7979              "Verifies that terminated invocation is no longer included in the ballot"),
7980         Case(DEMOTE_INVOCATION, "demote_invocation",
7981              "Verifies that the demoted invocation is not present in the ballot"),
7982         Case(DEMOTE_ENTIRE_QUAD, "demote_entire_quad", "Verifies that the demoted quad is not present in the ballot"),
7983         Case(DEMOTE_HALF_QUAD_TOP, "demote_half_quad_top",
7984              "Verifies that the demoted part of the quad is not present in the ballot"),
7985         Case(DEMOTE_HALF_QUAD_RIGHT, "demote_half_quad_right",
7986              "Verifies that the demoted part of the quad is not present in the ballot"),
7987         Case(DEMOTE_HALF_QUAD_BOTTOM, "demote_half_quad_bottom",
7988              "Verifies that the demoted part of the quad is not present in the ballot"),
7989         Case(DEMOTE_HALF_QUAD_LEFT, "demote_half_quad_left",
7990              "Verifies that the demoted part of the quad is not present in the ballot"),
7991         Case(DEMOTE_HALF_QUAD_SLASH, "demote_half_quad_slash",
7992              "Verifies that the demoted part of the quad is not present in the ballot"),
7993         Case(DEMOTE_HALF_QUAD_BACKSLASH, "demote_half_quad_backslash",
7994              "Verifies that the demoted part of the quad is not present in the ballot"),
7995     };
7996 
7997     auto testSupports = [](Context &context, std::string testName) -> void
7998     {
7999         if (!(context.getSubgroupProperties().supportedStages & VK_SHADER_STAGE_FRAGMENT_BIT))
8000             TCU_THROW(NotSupportedError, "Subgroup operations not supported in fragment stage");
8001 
8002         if (!context.getShaderMaximalReconvergenceFeatures().shaderMaximalReconvergence)
8003             TCU_THROW(NotSupportedError, "shaderMaximalReconvergence not supported");
8004 
8005         if (!(context.getSubgroupProperties().supportedOperations & VK_SUBGROUP_FEATURE_BALLOT_BIT))
8006             TCU_THROW(NotSupportedError, "VK_SUBGROUP_FEATURE_BALLOT_BIT not supported");
8007 
8008         if (Case::matches(testName, {cases[DEMOTE_ENTIRE_QUAD]}))
8009         {
8010             if (!(context.getSubgroupProperties().subgroupSize > 4))
8011                 TCU_THROW(NotSupportedError, "subgroupSize is less than or equal to 4");
8012         }
8013         else
8014         {
8015             if (!(context.getSubgroupProperties().subgroupSize >= 4))
8016                 TCU_THROW(NotSupportedError, "subgroupSize is less than 4");
8017         }
8018 
8019         if (Case::matches(testName, {cases[TERMINATE_INVOCATION]}))
8020         {
8021             if (!context.getShaderTerminateInvocationFeatures().shaderTerminateInvocation)
8022                 TCU_THROW(NotSupportedError, "shaderTerminateInvocation not supported.");
8023         }
8024         else
8025         {
8026 #ifndef CTS_USES_VULKANSC
8027             if (!context.getShaderDemoteToHelperInvocationFeatures().shaderDemoteToHelperInvocation)
8028                 TCU_THROW(NotSupportedError, "demoteToHelperInvocation not supported.");
8029 #else
8030             if (!context.getShaderDemoteToHelperInvocationFeaturesEXT().shaderDemoteToHelperInvocation)
8031                 TCU_THROW(NotSupportedError, "demoteToHelperInvocation not supported.");
8032 #endif
8033         }
8034     };
8035 
8036     auto updateTest = [&](add_ptr<AmberTestCase> theTest) -> add_ptr<AmberTestCase>
8037     {
8038         theTest->setCheckSupportCallback(testSupports);
8039         return theTest;
8040     };
8041 
8042     const std::string testsFolder(std::string("reconvergence/maximal/") + group->getName());
8043 
8044     for (add_cref<Case> aCase : cases)
8045     {
8046         group->addChild(updateTest(
8047             createAmberTestCase(testCtx, aCase.name, aCase.desc, testsFolder.c_str(), aCase.makeFileName())));
8048     }
8049 }
8050 
8051 } // namespace
8052 
createTests(tcu::TestContext & testCtx,const std::string & name)8053 tcu::TestCaseGroup *createTests(tcu::TestContext &testCtx, const std::string &name)
8054 {
8055     return createTests(testCtx, name, false);
8056 }
8057 
createTestsExperimental(tcu::TestContext & testCtx,const std::string & name)8058 tcu::TestCaseGroup *createTestsExperimental(tcu::TestContext &testCtx, const std::string &name)
8059 {
8060     return createTests(testCtx, name, true);
8061 }
8062 
8063 } // namespace Reconvergence
8064 } // namespace vkt
8065