1 /*------------------------------------------------------------------------
2 * Vulkan Conformance Tests
3 * ------------------------
4 *
5 * Copyright (c) 2019 The Khronos Group Inc.
6 * Copyright (c) 2018-2020 NVIDIA Corporation
7 *
8 * Licensed under the Apache License, Version 2.0 (the "License");
9 * you may not use this file except in compliance with the License.
10 * You may obtain a copy of the License at
11 *
12 * http://www.apache.org/licenses/LICENSE-2.0
13 *
14 * Unless required by applicable law or agreed to in writing, software
15 * distributed under the License is distributed on an "AS IS" BASIS,
16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 * See the License for the specific language governing permissions and
18 * limitations under the License.
19 *
20 * \file
21 * \brief Vulkan Reconvergence tests
22 *//*--------------------------------------------------------------------*/
23
24 #include "vktReconvergenceTests.hpp"
25
26 #include "vkBufferWithMemory.hpp"
27 #include "vkImageWithMemory.hpp"
28 #include "vkQueryUtil.hpp"
29 #include "vkBuilderUtil.hpp"
30 #include "vkCmdUtil.hpp"
31 #include "vkTypeUtil.hpp"
32 #include "vkObjUtil.hpp"
33
34 #include "vktTestGroupUtil.hpp"
35 #include "vktTestCase.hpp"
36 #include "vktAmberTestCase.hpp"
37
38 #include "deDefs.h"
39 #include "deFloat16.h"
40 #include "deMath.h"
41 #include "deRandom.h"
42 #include "deSharedPtr.hpp"
43 #include "deString.h"
44
45 #include "tcuTestCase.hpp"
46 #include "tcuTestLog.hpp"
47
48 #include <array>
49 #include <bitset>
50 #include <functional>
51 #include <map>
52 #include <numeric>
53 #include <random>
54 #include <string>
55 #include <sstream>
56 #include <set>
57 #include <type_traits>
58 #include <vector>
59 #include <memory>
60 #include <cmath>
61 #include <initializer_list>
62
63 #include <iostream>
64
65 // #define INCLUDE_GRAPHICS_TESTS
66
67 namespace vkt
68 {
69 namespace Reconvergence
70 {
71 namespace
72 {
73 using namespace vk;
74 using namespace std;
75
76 #define ARRAYSIZE(x) (sizeof(x) / sizeof(x[0]))
77 #define ROUNDUP(x__, multiplier__) ((((x__) + ((multiplier__)-1)) / (multiplier__)) * (multiplier__))
78 #define ROUNDDOWN(x__, multiplier__) (((x__) / (multiplier__)) * (multiplier__))
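// For illustration: ROUNDUP(13, 8) evaluates to 16 and ROUNDDOWN(13, 8) to 8;
// both macros assume a positive, non-zero multiplier.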
79 constexpr uint32_t MAX_INVOCATIONS_ALL_TESTS = 64 * 64;
80 typedef std::bitset<MAX_INVOCATIONS_ALL_TESTS> bitset_inv_t;
81 //constexpr bitset_inv_t MAGIC_BALLOT = 0x12345678;
82
83 typedef enum
84 {
85 TT_SUCF_ELECT, // subgroup_uniform_control_flow using elect (subgroup_basic)
86 TT_SUCF_BALLOT, // subgroup_uniform_control_flow using ballot (subgroup_ballot)
87 TT_WUCF_ELECT, // workgroup uniform control flow using elect (subgroup_basic)
88 TT_WUCF_BALLOT, // workgroup uniform control flow using ballot (subgroup_ballot)
89 TT_MAXIMAL, // maximal reconvergence
90 } TestType;
91
92 static_assert(VK_TRUE == 1, "VK_TRUE must equal 1");
93
94 struct CaseDef
95 {
96 VkShaderStageFlagBits shaderStage;
97 TestType testType;
98 uint32_t maxNesting;
99 uint32_t seed;
100 // In the case of a compute shader, the sizes below are local_size_x and local_size_y respectively.
101 // In the case of a fragment shader, these sizes define the framebuffer dimensions.
102 uint32_t sizeX;
103 uint32_t sizeY;
104
105 bool isWUCF() const
106 {
107 return testType == TT_WUCF_ELECT || testType == TT_WUCF_BALLOT;
108 }
109 bool isSUCF() const
110 {
111 return testType == TT_SUCF_ELECT || testType == TT_SUCF_BALLOT;
112 }
113 bool isUCF() const
114 {
115 return isWUCF() || isSUCF();
116 }
117 bool isElect() const
118 {
119 return testType == TT_WUCF_ELECT || testType == TT_SUCF_ELECT;
120 }
121
122 bool verify() const
123 {
124 return (sizeX * sizeY) <= MAX_INVOCATIONS_ALL_TESTS;
125 }
126 };
127
128 template <class T, class P = T (*)[1], class R = decltype(std::begin(*std::declval<P>()))>
129 static auto makeStdBeginEnd(void *p, uint32_t n) -> std::pair<R, R>
130 {
131 auto tmp = std::begin(*P(p));
132 auto begin = tmp;
133 std::advance(tmp, n);
134 return {begin, tmp};
135 }
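// Illustrative usage (hypothetical pointer and count, not taken from this file):
//   auto range = makeStdBeginEnd<uint32_t>(hostPtr, wordCount);
//   std::fill(range.first, range.second, 0u);
// i.e. it views a raw (e.g. mapped) pointer as a typed iterator range of n elements.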
136
137 template <class R>
138 using add_ref = typename std::add_lvalue_reference<R>::type;
139 template <class R>
140 using add_cref = typename std::add_lvalue_reference<typename std::add_const<R>::type>::type;
141 template <class X>
142 using add_ptr = std::add_pointer_t<X>;
143 template <class X>
144 using add_cptr = std::add_pointer_t<std::add_const_t<X>>;
145
146 template <class RndIter>
147 RndIter max_element(RndIter first, RndIter last)
148 {
149 RndIter max = last;
150 if (first != last)
151 {
152 for (max = first, ++first; first != last; ++first)
153 {
154 if (*first > *max)
155 max = first;
156 }
157 }
158 return max;
159 }
160
161 template <class RndIter, class Selector>
162 RndIter max_element(RndIter first, RndIter last, Selector selector)
163 {
164 RndIter max = last;
165 if (first != last)
166 {
167 for (max = first, ++first; first != last; ++first)
168 {
169 if (selector(*first) > selector(*max))
170 max = first;
171 }
172 }
173 return max;
174 }
175
176 struct Ballot : public std::bitset<128>
177 {
178 typedef std::bitset<128> super;
179 Ballot() : super()
180 {
181 }
182 Ballot(add_cref<super> ballot, uint32_t printbits = 128u) : super(ballot), m_bits(printbits)
183 {
184 }
185 Ballot(add_cref<tcu::UVec4> ballot, uint32_t printbits = 128u) : super(), m_bits(printbits)
186 {
187 *this = ballot;
188 }
189 Ballot(uint64_t val, uint32_t printbits = 128u) : super(val), m_bits(printbits)
190 {
191 }
192 static Ballot withSetBit(uint32_t bit)
193 {
194 Ballot b;
195 b.set(bit);
196 return b;
197 }
198 constexpr uint32_t size() const
199 {
200 return static_cast<uint32_t>(super::size());
201 }
202 operator tcu::UVec4() const
203 {
204 tcu::UVec4 result;
205 super ballot(*this);
206 const super mask = 0xFFFFFFFF;
207 for (uint32_t k = 0; k < 4u; ++k)
208 {
209 result[k] = uint32_t((ballot & mask).to_ulong());
210 ballot >>= 32;
211 }
212 return result;
213 }
214 add_ref<Ballot> operator=(add_cref<tcu::UVec4> vec)
215 {
216 for (uint32_t k = 0; k < 4u; ++k)
217 {
218 (*this) <<= 32;
219 (*this) |= vec[3 - k];
220 }
221 return *this;
222 }
223 DE_UNUSED_FUNCTION uint32_t getw() const
224 {
225 return m_bits;
226 }
227 DE_UNUSED_FUNCTION void setw(uint32_t bits)
228 {
229 m_bits = bits;
230 }
231 DE_UNUSED_FUNCTION friend add_ref<std::ostream> operator<<(add_ref<std::ostream> str, add_cref<Ballot> ballot)
232 {
233 for (uint32_t i = 0u; i < ballot.m_bits && i < 128u; ++i)
234 {
235 str << (ballot[ballot.m_bits - i - 1u] ? '1' : '0');
236 }
237 return str;
238 }
239
240 protected:
241 uint32_t m_bits;
242 };
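// Note on the Ballot <-> tcu::UVec4 mapping above: word k of the UVec4 holds invocations
// [32*k, 32*k + 31], so e.g. Ballot(tcu::UVec4(0x2u, 0u, 0u, 0u)) has only bit 1 set.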
243
244 struct Ballots : protected std::vector<std::bitset<128>>
245 {
246 typedef std::vector<value_type> super;
247 static const constexpr uint32_t subgroupInvocationSize = static_cast<uint32_t>(value_type().size());
248 Ballots() : super()
249 {
250 }
251 explicit Ballots(uint32_t subgroupCount, add_cref<value_type> ballot = {}) : super(subgroupCount)
252 {
253 if (ballot.any())
254 *this = ballot;
255 }
256 Ballots(add_cref<Ballots> other) : super(upcast(other))
257 {
258 }
259 Ballots(Ballots &&other) : super(std::move(other))
260 {
261 }
262 using super::operator[];
263 using super::at;
264 /**
265 * @brief size method
266 * @return Returns the number of bits that the Ballots holds.
267 */
268 uint32_t size() const
269 {
270 return static_cast<uint32_t>(super::size() * subgroupInvocationSize);
271 }
272 /**
273 * @brief count method
274 * @return Returns the number of bits that are set to true.
275 */
276 uint32_t count() const
277 {
278 uint32_t n = 0u;
279 for (add_cref<value_type> b : *this)
280 n += static_cast<uint32_t>(b.count());
281 return n;
282 }
283 /**
284 * @brief count method
285 * @return Returns the number of bits that are set to true in given subgroup.
286 */
287 uint32_t count(uint32_t subgroup) const
288 {
289 DE_ASSERT(subgroup < subgroupCount());
290 return static_cast<uint32_t>(at(subgroup).count());
291 }
292 uint32_t subgroupCount() const
293 {
294 return static_cast<uint32_t>(super::size());
295 }
296 bool test(uint32_t bit) const
297 {
298 DE_ASSERT(bit < size());
299 return at(bit / subgroupInvocationSize).test(bit % subgroupInvocationSize);
300 }
301 bool set(uint32_t bit, bool value = true)
302 {
303 DE_ASSERT(bit <= size());
304 const bool before = test(bit);
305 at(bit / subgroupInvocationSize).set((bit % subgroupInvocationSize), value);
306 return before;
307 }
308 void full()
309 {
310 const uint32_t bb = size();
311 for (uint32_t b = 0u; b < bb; ++b)
312 set(b);
313 }
314 add_ref<Ballots> setn(uint32_t bits)
315 {
316 for (uint32_t i = 0u; i < bits; ++i)
317 set(i);
318 return *this;
319 }
320 bool all() const
321 {
322 const uint32_t gg = subgroupCount();
323 for (uint32_t g = 0u; g < gg; ++g)
324 {
325 if (false == at(g).all())
326 return false;
327 }
328 return (gg != 0u);
329 }
330 bool none() const
331 {
332 const uint32_t gg = subgroupCount();
333 for (uint32_t g = 0u; g < gg; ++g)
334 {
335 if (false == at(g).none())
336 return false;
337 }
338 return (gg != 0u);
339 }
340 bool any() const
341 {
342 bool res = false;
343 const uint32_t gg = subgroupCount();
344 for (uint32_t g = 0u; g < gg; ++g)
345 res |= super::at(g).any();
346 return res;
347 }
348 static uint32_t findBit(uint32_t otherFullyQualifiedInvocationID, uint32_t otherSubgroupSize)
349 {
350 return (((otherFullyQualifiedInvocationID / otherSubgroupSize) * subgroupInvocationSize) +
351 (otherFullyQualifiedInvocationID % otherSubgroupSize));
352 }
353 inline add_cref<super> upcast(add_cref<Ballots> other) const
354 {
355 return static_cast<add_cref<super>>(other);
356 }
357 add_ref<Ballots> operator&=(add_cref<Ballots> other)
358 {
359 DE_ASSERT(subgroupCount() == other.subgroupCount());
360 const uint32_t gg = subgroupCount();
361 for (uint32_t g = 0u; g < gg; ++g)
362 super::at(g) = super::at(g) & upcast(other).at(g);
363 return *this;
364 }
365 Ballots operator&(add_cref<Ballots> other) const
366 {
367 Ballots res(*this);
368 res &= other;
369 return res;
370 }
371 add_ref<Ballots> operator|=(add_cref<Ballots> other)
372 {
373 DE_ASSERT(subgroupCount() == other.subgroupCount());
374 const uint32_t gg = subgroupCount();
375 for (uint32_t g = 0u; g < gg; ++g)
376 super::at(g) = super::at(g) | upcast(other).at(g);
377 return *this;
378 }
379 Ballots operator|(add_cref<Ballots> other) const
380 {
381 Ballots res(*this);
382 res |= other;
383 return res;
384 }
385 add_ref<Ballots> operator<<=(uint32_t bits)
386 {
387 return ((*this) = ((*this) << bits));
388 }
389 Ballots operator<<(uint32_t bits) const
390 {
391 Ballots res(subgroupCount());
392 if (bits < size() && bits != 0u)
393 {
394 for (uint32_t b = 0; b < bits; ++b)
395 res.set((b + bits), test(b));
396 }
397 return res;
398 }
399 Ballots operator~() const
400 {
401 Ballots res(*this);
402 const uint32_t gg = subgroupCount();
403 for (uint32_t g = 0u; g < gg; ++g)
404 res.at(g) = super::at(g).operator~();
405 return res;
406 }
407 bool operator==(add_cref<Ballots> other) const
408 {
409 if (super::size() == upcast(other).size())
410 {
411 const uint32_t gg = subgroupCount();
412 for (uint32_t g = 0u; g < gg; ++g)
413 {
414 if (at(g) != other[g])
415 return false;
416 }
417 return true;
418 }
419 return false;
420 }
421 add_ref<Ballots> operator=(add_cref<Ballots> other)
422 {
423 DE_ASSERT((subgroupCount() == other.subgroupCount()));
424 const uint32_t gg = subgroupCount();
425 for (uint32_t g = 0u; g < gg; ++g)
426 at(g) = other.at(g);
427 return *this;
428 }
429 add_ref<Ballots> operator=(add_cref<value_type> forAllGroups)
430 {
431 DE_ASSERT(super::size() >= 1u);
432 const uint32_t gg = subgroupCount();
433 for (uint32_t g = 0u; g < gg; ++g)
434 at(g) = forAllGroups;
435 return *this;
436 }
437 };
438
439 uint64_t subgroupSizeToMask(uint32_t subgroupSize)
440 {
441 if (subgroupSize == 64)
442 return ~0ULL;
443 else
444 return (1ULL << subgroupSize) - 1;
445 }
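// e.g. subgroupSizeToMask(8) == 0xFF, while subgroupSizeToMask(64) hits the special case and returns ~0ULL.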
446
447 Ballot subgroupSizeToMask(uint32_t subgroupSize, uint32_t subgroupCount)
448 {
449 DE_UNREF(subgroupCount);
450 Ballot b;
451 DE_ASSERT(subgroupSize <= b.size());
452 for (uint32_t i = 0; i < subgroupSize; ++i)
453 b.set(i);
454 return b;
455 }
456
457 // Take a 64-bit integer, mask it to the subgroup size, and then
458 // replicate it for each subgroup
459 bitset_inv_t bitsetFromU64(uint64_t mask, uint32_t subgroupSize)
460 {
461 mask &= subgroupSizeToMask(subgroupSize);
462 bitset_inv_t result(mask);
463 for (uint32_t i = 0; i < result.size() / subgroupSize - 1; ++i)
464 {
465 result = (result << subgroupSize) | bitset_inv_t(mask);
466 }
467 return result;
468 }
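// e.g. bitsetFromU64(0x3, 4) sets bits 0 and 1 of every 4-bit subgroup slot,
// i.e. the pattern ...0011 0011 0011 repeated across the whole bitset.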
469
470 Ballots ballotsFromU64(uint64_t maskValue, uint32_t subgroupSize, uint32_t subgroupCount)
471 {
472 Ballot b(maskValue);
473 b &= subgroupSizeToMask(subgroupSize, subgroupCount);
474 Ballots result(subgroupCount);
475 for (uint32_t g = 0; g < subgroupCount; ++g)
476 result.at(g) = b;
477 return result;
478 }
479
480 Ballots ballotsFromBallot(Ballot b, uint32_t subgroupSize, uint32_t subgroupCount)
481 {
482 b &= subgroupSizeToMask(subgroupSize, subgroupCount);
483 Ballots result(subgroupCount);
484 for (uint32_t g = 0; g < subgroupCount; ++g)
485 result.at(g) = b;
486 return result;
487 }
488
489 // Pick out the mask for the subgroup that invocationID is a member of
490 uint64_t bitsetToU64(const bitset_inv_t &bitset, uint32_t subgroupSize, uint32_t invocationID)
491 {
492 bitset_inv_t copy(bitset);
493 copy >>= (invocationID / subgroupSize) * subgroupSize;
494 copy &= bitset_inv_t(subgroupSizeToMask(subgroupSize));
495 uint64_t mask = copy.to_ullong();
496 mask &= subgroupSizeToMask(subgroupSize);
497 return mask;
498 }
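// e.g. with subgroupSize == 8, invocationID == 19 belongs to subgroup 2, so this
// returns bits [16, 24) of the bitset shifted down into the low byte of the result.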
499
500 // Pick out the mask for the subgroup that invocationID is a member of
501 Ballot bitsetToBallot(const Ballots &bitset, uint32_t subgroupSize, uint32_t invocationID)
502 {
503 return bitset.at(invocationID / subgroupSize) & subgroupSizeToMask(subgroupSize, bitset.subgroupCount());
504 }
505
506 // Pick out the mask for the subgroup that invocationID is a member of
507 Ballot bitsetToBallot(add_cref<Ballots> bitset, add_cref<Ballot> subgroupSizeMask, uint32_t subgroupSize,
508 uint32_t invocationID)
509 {
510 return bitset.at(invocationID / subgroupSize) & subgroupSizeMask;
511 }
512
513 Ballot bitsetToBallot(uint64_t value, uint32_t subgroupCount, uint32_t subgroupSize, uint32_t invocationID)
514 {
515 Ballots bs = ballotsFromU64(value, subgroupSize, subgroupCount);
516 return bitsetToBallot(bs, subgroupSize, invocationID);
517 }
518
519 static int findLSB(uint64_t value)
520 {
521 for (int i = 0; i < 64; i++)
522 {
523 if (value & (1ULL << i))
524 return i;
525 }
526 return -1;
527 }
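// e.g. findLSB(0x8) == 3; findLSB(0) == -1 (no bit set).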
528
529 template <uint32_t N>
530 static uint32_t findLSB(add_cref<std::bitset<N>> value)
531 {
532 for (uint32_t i = 0u; i < N; ++i)
533 {
534 if (value.test(i))
535 return i;
536 }
537 return std::numeric_limits<uint32_t>::max();
538 }
539
540 // For each subgroup, pick out the elected invocationID, and accumulate
541 // a bitset of all of them
542 static bitset_inv_t bitsetElect(const bitset_inv_t &value, int32_t subgroupSize)
543 {
544 bitset_inv_t ret; // zero initialized
545
546 for (int32_t i = 0; i < (int32_t)value.size(); i += subgroupSize)
547 {
548 uint64_t mask = bitsetToU64(value, subgroupSize, i);
549 int lsb = findLSB(mask);
550 ret |= bitset_inv_t(lsb == -1 ? 0 : (1ULL << lsb)) << i;
551 }
552 return ret;
553 }
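// e.g. with subgroupSize == 4, a subgroup whose slice of 'value' is 0b0110 contributes
// only 0b0010 (its lowest set bit) to the result.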
554
555 static Ballots bitsetElect(add_cref<Ballots> value)
556 {
557 Ballots ret(value.subgroupCount());
558 for (uint32_t g = 0u; g < value.subgroupCount(); ++g)
559 {
560 const uint32_t lsb = findLSB<Ballots::subgroupInvocationSize>(value.at(g));
561 if (lsb != std::numeric_limits<uint32_t>::max())
562 {
563 ret.at(g).set(lsb);
564 }
565 }
566 return ret;
567 }
568
569 struct PushConstant
570 {
571 int32_t invocationStride;
572 uint32_t width;
573 uint32_t height;
574 uint32_t primitiveStride;
575 uint32_t subgroupStride;
576 uint32_t enableInvocationIndex;
577 };
578
579 struct Vertex
580 {
581 // Traditional POD structure that mimics a vertex.
582 // Be careful before making any changes to this structure,
583 // because it is strictly mapped to VK_FORMAT_R32G32B32A32_SFLOAT
584 // when the graphics pipeline is constructed.
585 float x, y, z, w;
586 };
587
588 typedef Vertex Triangle[3];
589
590 class RandomProgram;
591 class ComputeRandomProgram;
592
593 std::pair<vk::VkPhysicalDeviceSubgroupProperties, vk::VkPhysicalDeviceProperties2> getSubgroupProperties(
594 vkt::Context &context)
595 {
596 vk::VkPhysicalDeviceSubgroupProperties subgroupProperties;
597 deMemset(&subgroupProperties, 0, sizeof(subgroupProperties));
598 subgroupProperties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES;
599
600 vk::VkPhysicalDeviceProperties2 properties2;
601 deMemset(&properties2, 0, sizeof(properties2));
602 properties2.sType = vk::VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2;
603 properties2.pNext = &subgroupProperties;
604
605 context.getInstanceInterface().getPhysicalDeviceProperties2(context.getPhysicalDevice(), &properties2);
606
607 return {subgroupProperties, properties2};
608 }
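// Typical use in this file: getSubgroupProperties(context).first.subgroupSize
// (see the ReconvergenceTestInstance constructor below).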
609
610 class ReconvergenceTestInstance : public TestInstance
611 {
612 public:
613 // { vert, frag, tesc, tese, geom }; if any
614 using Shaders = std::vector<Move<VkShaderModule>>;
615
616 ReconvergenceTestInstance(Context &context, const CaseDef &data)
617 : TestInstance(context)
618 , m_data(data)
619 , m_subgroupSize(getSubgroupProperties(context).first.subgroupSize)
620 {
621 }
622 ~ReconvergenceTestInstance(void) = default;
623
624 Move<VkPipeline> createComputePipeline(const VkPipelineLayout pipelineLayout, const VkShaderModule computeShader);
625 Move<VkPipeline> createGraphicsPipeline(const VkPipelineLayout pipelineLayout, const VkRenderPass renderPass,
626 const uint32_t width, const uint32_t height, const Shaders &shaders,
627 const VkPrimitiveTopology topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST,
628 const uint32_t patchControlPoints = 0u);
629
630 protected:
631 const CaseDef m_data;
632 const uint32_t m_subgroupSize;
633 };
634
635 class ReconvergenceTestComputeInstance : public ReconvergenceTestInstance
636 {
637 public:
638 ReconvergenceTestComputeInstance(Context &context, const CaseDef &data, std::shared_ptr<RandomProgram> program,
639 std::map<uint32_t, uint32_t> &&subgroupSizeToMaxLoc)
640 : ReconvergenceTestInstance(context, data)
641 , m_program(std::static_pointer_cast<ComputeRandomProgram>(program))
642 , m_subgroupSizeToMaxLoc(std::move(subgroupSizeToMaxLoc))
643 {
644 }
645 ~ReconvergenceTestComputeInstance(void) = default;
646
647 virtual tcu::TestStatus iterate(void) override;
648 qpTestResult_e calculateAndLogResult(const tcu::UVec4 *result, const std::vector<tcu::UVec4> &ref,
649 uint32_t invocationStride, uint32_t subgroupSize, uint32_t shaderMaxLoc);
650
651 private:
652 std::shared_ptr<ComputeRandomProgram> m_program;
653 std::map<uint32_t, uint32_t> m_subgroupSizeToMaxLoc;
654 };
655
656 class ReconvergenceTestGraphicsInstance : public ReconvergenceTestInstance
657 {
658 public:
659 ReconvergenceTestGraphicsInstance(Context &context, const CaseDef &data) : ReconvergenceTestInstance(context, data)
660 {
661 }
662 ~ReconvergenceTestGraphicsInstance(void) = default;
663
664 auto makeRenderPassBeginInfo(const VkRenderPass renderPass, const VkFramebuffer framebuffer)
665 -> VkRenderPassBeginInfo;
666 virtual auto recordDrawingAndSubmit(const VkCommandBuffer cmdBuffer, const VkPipelineLayout pipelineLayout,
667 const VkPipeline pipeline, const VkDescriptorSet descriptorSet,
668 const PushConstant &pushConstant, const VkRenderPassBeginInfo &renderPassInfo,
669 const VkBuffer vertexBuffer, const uint32_t vertexCount, const VkImage image)
670 -> void;
671 virtual auto generateVertices(const uint32_t primitiveCount, const VkPrimitiveTopology topology,
672 const uint32_t patchSize = 1) -> std::vector<tcu::Vec4>;
673 virtual auto createVertexBufferAndFlush(const std::vector<tcu::Vec4> &vertices) -> de::MovePtr<BufferWithMemory>;
674 virtual auto createVertexBufferAndFlush(uint32_t cellsHorz, uint32_t cellsVert, VkPrimitiveTopology topology)
675 -> de::MovePtr<BufferWithMemory>;
676 virtual auto createShaders(void) -> Shaders = 0;
677
678 enum PrintMode
679 {
680 None,
681 ThreadsInColumns,
682 OutLocsInColumns,
683 IntuitiveThreadsOutlocs,
684 Console
685 };
686
687 virtual auto calculateAndLogResult(const uint64_t *result, const std::vector<uint64_t> &ref,
688 uint32_t invocationStride, uint32_t subgroupSize, uint32_t shaderMaxLocs,
689 uint32_t primitiveCount, PrintMode printMode) -> qpTestResult_e;
690 };
691
692 class ReconvergenceTestFragmentInstance : public ReconvergenceTestGraphicsInstance
693 {
694 struct Arrangement
695 {
696 };
697 friend class FragmentRandomProgram;
698
699 public:
700 ReconvergenceTestFragmentInstance(Context &context, const CaseDef &data)
701 : ReconvergenceTestGraphicsInstance(context, data)
702 {
703 }
704 ~ReconvergenceTestFragmentInstance(void) = default;
705 virtual auto createShaders(void) -> std::vector<Move<VkShaderModule>> override;
706 auto callAuxiliaryShader(tcu::TestStatus &status, uint32_t triangleCount) -> std::vector<uint32_t>;
707 auto makeImageCreateInfo(VkFormat format) const -> VkImageCreateInfo;
708 virtual auto createVertexBufferAndFlush(uint32_t cellsHorz, uint32_t cellsVert, VkPrimitiveTopology topology)
709 -> de::MovePtr<BufferWithMemory> override;
710 virtual auto iterate(void) -> tcu::TestStatus override;
711 auto calculateAndLogResultEx(tcu::TestLog &log, const tcu::UVec4 *result, const std::vector<tcu::UVec4> &ref,
712 const uint32_t maxLoc, const Arrangement &a, const PrintMode printMode)
713 -> qpTestResult_e;
714 };
715
716 class ReconvergenceTestVertexInstance : public ReconvergenceTestGraphicsInstance
717 {
718 public:
719 ReconvergenceTestVertexInstance(Context &context, const CaseDef &data)
720 : ReconvergenceTestGraphicsInstance(context, data)
721 {
722 }
723 ~ReconvergenceTestVertexInstance(void) = default;
724 virtual auto createShaders(void) -> std::vector<Move<VkShaderModule>> override;
725 virtual auto createVertexBufferAndFlush(uint32_t cellsHorz, uint32_t cellsVert, VkPrimitiveTopology topology)
726 -> de::MovePtr<BufferWithMemory> override;
727
728 virtual auto iterate(void) -> tcu::TestStatus override;
729 auto calculateAndLogResultEx(add_ref<tcu::TestLog> log, const tcu::UVec4 *result,
730 const std::vector<tcu::UVec4> &ref, const uint32_t maxLoc, const PrintMode printMode)
731 -> qpTestResult_e;
732 };
733
734 class ReconvergenceTestTessCtrlInstance : public ReconvergenceTestGraphicsInstance
735 {
736 public:
737 ReconvergenceTestTessCtrlInstance(Context &context, const CaseDef &data)
738 : ReconvergenceTestGraphicsInstance(context, data)
739 {
740 }
741 ~ReconvergenceTestTessCtrlInstance(void) = default;
742 virtual auto createShaders(void) -> std::vector<Move<VkShaderModule>> override;
743 virtual auto iterate(void) -> tcu::TestStatus override;
744 };
745
746 class ReconvergenceTestTessEvalInstance : public ReconvergenceTestGraphicsInstance
747 {
748 public:
749 ReconvergenceTestTessEvalInstance(Context &context, add_cref<CaseDef> data)
750 : ReconvergenceTestGraphicsInstance(context, data)
751 {
752 }
753 ~ReconvergenceTestTessEvalInstance(void) = default;
754 virtual auto createShaders(void) -> std::vector<Move<VkShaderModule>> override;
755 virtual auto iterate(void) -> tcu::TestStatus override;
756 };
757
758 class ReconvergenceTestGeometryInstance : public ReconvergenceTestGraphicsInstance
759 {
760 public:
761 ReconvergenceTestGeometryInstance(Context &context, add_cref<CaseDef> data)
762 : ReconvergenceTestGraphicsInstance(context, data)
763 {
764 }
765 ~ReconvergenceTestGeometryInstance(void) = default;
766 virtual auto createShaders(void) -> std::vector<Move<VkShaderModule>> override;
767 virtual auto createVertexBufferAndFlush(uint32_t cellsHorz, uint32_t cellsVert, VkPrimitiveTopology topology)
768 -> de::MovePtr<BufferWithMemory> override;
769
770 virtual auto iterate(void) -> tcu::TestStatus override;
771 auto calculateAndLogResultEx(add_ref<tcu::TestLog> log, const tcu::UVec4 *result,
772 const std::vector<tcu::UVec4> &ref, const uint32_t maxLoc, const PrintMode printMode)
773 -> qpTestResult_e;
774 };
775
776 Move<VkPipeline> ReconvergenceTestInstance::createGraphicsPipeline(const VkPipelineLayout pipelineLayout,
777 const VkRenderPass renderPass, const uint32_t width,
778 const uint32_t height, const Shaders &shaders,
779 const VkPrimitiveTopology topology,
780 const uint32_t patchControlPoints)
781 {
782 const DeviceInterface &vkd = m_context.getDeviceInterface();
783 const VkDevice device = m_context.getDevice();
784 const uint32_t subpass = 0;
785
786 const std::vector<VkViewport> viewports{makeViewport(width, height)};
787 const std::vector<VkRect2D> scissors{makeRect2D(width, height)};
788
789 enum ShaderIndex
790 {
791 IVERT = 0,
792 IFRAG,
793 ITESC,
794 ITESE,
795 IGEOM
796 };
797 VkShaderModule handles[5] = {VK_NULL_HANDLE, VK_NULL_HANDLE, VK_NULL_HANDLE, VK_NULL_HANDLE,
798 VK_NULL_HANDLE}; // { vert, frag, tesc, tese, geom }
799
800 for (uint32_t i = 0; i < (uint32_t)ARRAYSIZE(handles); ++i)
801 {
802 handles[i] = (i < (uint32_t)shaders.size()) ? *shaders[i] : VK_NULL_HANDLE;
803 }
804
805 return makeGraphicsPipeline(vkd, device, pipelineLayout, handles[IVERT], handles[ITESC], handles[ITESE],
806 handles[IGEOM], handles[IFRAG], renderPass, viewports, scissors, topology, subpass,
807 patchControlPoints);
808 }
809
810 Move<VkPipeline> ReconvergenceTestInstance::createComputePipeline(const VkPipelineLayout pipelineLayout,
811 const VkShaderModule computeShader)
812 {
813 const DeviceInterface &vk = m_context.getDeviceInterface();
814 const VkDevice device = m_context.getDevice();
815
816 const uint32_t specData[2] = {m_data.sizeX, m_data.sizeY};
817 const vk::VkSpecializationMapEntry entries[DE_LENGTH_OF_ARRAY(specData)] = {
818 {0, (uint32_t)(sizeof(uint32_t) * 0), sizeof(uint32_t)},
819 {1, (uint32_t)(sizeof(uint32_t) * 1), sizeof(uint32_t)},
820 };
821 const vk::VkSpecializationInfo specInfo = {
822 DE_LENGTH_OF_ARRAY(entries), // mapEntryCount
823 entries, // pMapEntries
824 sizeof(specData), // dataSize
825 specData // pData
826 };
827
828 const VkPipelineShaderStageRequiredSubgroupSizeCreateInfoEXT subgroupSizeCreateInfo = {
829 VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO_EXT, // VkStructureType sType;
830 nullptr, // void* pNext;
831 m_subgroupSize // uint32_t requiredSubgroupSize;
832 };
833
834 const VkBool32 computeFullSubgroups =
835 m_subgroupSize <= 64 && m_context.getSubgroupSizeControlFeatures().computeFullSubgroups;
836
837 const void *shaderPNext = computeFullSubgroups ? &subgroupSizeCreateInfo : nullptr;
838 VkPipelineShaderStageCreateFlags pipelineShaderStageCreateFlags =
839 (VkPipelineShaderStageCreateFlags)(computeFullSubgroups ?
840 VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT_EXT :
841 0);
842
843 const VkPipelineShaderStageCreateInfo shaderCreateInfo = {
844 VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
845 shaderPNext,
846 pipelineShaderStageCreateFlags,
847 VK_SHADER_STAGE_COMPUTE_BIT, // stage
848 computeShader, // shader
849 "main",
850 &specInfo, // pSpecializationInfo
851 };
852
853 const VkComputePipelineCreateInfo pipelineCreateInfo = {
854 VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
855 nullptr,
856 0u, // flags
857 shaderCreateInfo, // cs
858 pipelineLayout, // layout
859 VK_NULL_HANDLE, // basePipelineHandle
860 0u, // basePipelineIndex
861 };
862
863 return vk::createComputePipeline(vk, device, VK_NULL_HANDLE, &pipelineCreateInfo, NULL);
864 }
865
866 typedef enum
867 {
868 // store subgroupBallot().
869 // For OP_BALLOT, OP::caseValue is initialized to zero, and then
870 // set to 1 by simulate if the ballot is not workgroup- (or subgroup-) uniform.
871 // Only workgroup-uniform ballots are validated for correctness in
872 // WUCF modes.
873 OP_BALLOT,
874
875 // store literal constant
876 OP_STORE,
877
878 // if ((1ULL << gl_SubgroupInvocationID) & mask).
879 // Special case if mask = ~0ULL, converted into "if (inputA.a[idx] == idx)"
880 OP_IF_MASK,
881 OP_ELSE_MASK,
882 OP_ENDIF,
883
884 // if (gl_SubgroupInvocationID == loopIdxN) (where N is most nested loop counter)
885 OP_IF_LOOPCOUNT,
886 OP_ELSE_LOOPCOUNT,
887
888 // if (gl_LocalInvocationIndex >= inputA.a[N]) (where N is most nested loop counter)
889 OP_IF_LOCAL_INVOCATION_INDEX,
890 OP_ELSE_LOCAL_INVOCATION_INDEX,
891
892 // break/continue
893 OP_BREAK,
894 OP_CONTINUE,
895
896 // if (subgroupElect())
897 OP_ELECT,
898
899 // Loop with uniform number of iterations (read from a buffer)
900 OP_BEGIN_FOR_UNIF,
901 OP_END_FOR_UNIF,
902
903 // for (int loopIdxN = 0; loopIdxN < gl_SubgroupInvocationID + 1; ++loopIdxN)
904 OP_BEGIN_FOR_VAR,
905 OP_END_FOR_VAR,
906
907 // for (int loopIdxN = 0;; ++loopIdxN, OP_BALLOT)
908 // Always has an "if (subgroupElect()) break;" inside.
909 // Does the equivalent of OP_BALLOT in the continue construct
910 OP_BEGIN_FOR_INF,
911 OP_END_FOR_INF,
912
913 // do { loopIdxN++; ... } while (loopIdxN < uniformValue);
914 OP_BEGIN_DO_WHILE_UNIF,
915 OP_END_DO_WHILE_UNIF,
916
917 // do { ... } while (true);
918 // Always has an "if (subgroupElect()) break;" inside
919 OP_BEGIN_DO_WHILE_INF,
920 OP_END_DO_WHILE_INF,
921
922 // return;
923 OP_RETURN,
924
925 // function call (code bracketed by these is extracted into a separate function)
926 OP_CALL_BEGIN,
927 OP_CALL_END,
928
929 // switch statement on uniform value
930 OP_SWITCH_UNIF_BEGIN,
931 // switch statement on gl_SubgroupInvocationID & 3 value
932 OP_SWITCH_VAR_BEGIN,
933 // switch statement on loopIdx value
934 OP_SWITCH_LOOP_COUNT_BEGIN,
935
936 // case statement with a (invocation mask, case mask) pair
937 OP_CASE_MASK_BEGIN,
938 // case statement used for loop counter switches, with a value and a mask of loop iterations
939 OP_CASE_LOOP_COUNT_BEGIN,
940
941 // end of switch/case statement
942 OP_SWITCH_END,
943 OP_CASE_END,
944
945 // Extra code with no functional effect. Currently includes:
946 // - value 0: while (!subgroupElect()) {}
947 // - value 1: if (condition_that_is_false) { infinite loop }
948 OP_NOISE,
949
950 // do nothing, only markup
951 OP_NOP
952 } OPType;
953
954 const char *OPtypeToStr(const OPType op)
955 {
956 #define MAKETEXT(s__) #s__
957 #define CASETEXT(e__) \
958 case e__: \
959 return MAKETEXT(e__)
960 switch (op)
961 {
962 CASETEXT(OP_BALLOT);
963 CASETEXT(OP_STORE);
964 CASETEXT(OP_IF_MASK);
965 CASETEXT(OP_ELSE_MASK);
966 CASETEXT(OP_ENDIF);
967 CASETEXT(OP_IF_LOOPCOUNT);
968 CASETEXT(OP_ELSE_LOOPCOUNT);
969 CASETEXT(OP_IF_LOCAL_INVOCATION_INDEX);
970 CASETEXT(OP_ELSE_LOCAL_INVOCATION_INDEX);
971 CASETEXT(OP_BREAK);
972 CASETEXT(OP_CONTINUE);
973 CASETEXT(OP_ELECT);
974 CASETEXT(OP_BEGIN_FOR_UNIF);
975 CASETEXT(OP_END_FOR_UNIF);
976 CASETEXT(OP_BEGIN_FOR_VAR);
977 CASETEXT(OP_END_FOR_VAR);
978 CASETEXT(OP_BEGIN_FOR_INF);
979 CASETEXT(OP_END_FOR_INF);
980 CASETEXT(OP_BEGIN_DO_WHILE_UNIF);
981 CASETEXT(OP_END_DO_WHILE_UNIF);
982 CASETEXT(OP_BEGIN_DO_WHILE_INF);
983 CASETEXT(OP_END_DO_WHILE_INF);
984 CASETEXT(OP_RETURN);
985 CASETEXT(OP_CALL_BEGIN);
986 CASETEXT(OP_CALL_END);
987 CASETEXT(OP_SWITCH_UNIF_BEGIN);
988 CASETEXT(OP_SWITCH_VAR_BEGIN);
989 CASETEXT(OP_SWITCH_LOOP_COUNT_BEGIN);
990 CASETEXT(OP_CASE_MASK_BEGIN);
991 CASETEXT(OP_CASE_LOOP_COUNT_BEGIN);
992 CASETEXT(OP_SWITCH_END);
993 CASETEXT(OP_CASE_END);
994 CASETEXT(OP_NOISE);
995 CASETEXT(OP_NOP);
996 }
997 return "<Unknown>";
998 }
999
1000 typedef enum
1001 {
1002 // Different if test conditions
1003 IF_MASK,
1004 IF_UNIFORM,
1005 IF_LOOPCOUNT,
1006 IF_LOCAL_INVOCATION_INDEX,
1007 } IFType;
1008
1009 class OP
1010 {
1011 public:
1012 OP(OPType _type, uint64_t _value, uint32_t _caseValue = 0)
1013 : type(_type)
1014 , value(_value)
1015 // by default, initialize the ballot with _value repeated in its lower and upper 64-bit halves
1016 , bvalue(tcu::UVec4(uint32_t(_value), uint32_t(_value >> 32), uint32_t(_value), uint32_t(_value >> 32)))
1017 , caseValue(_caseValue)
1018 {
1019 }
1020
1021 // The type of operation and an optional value.
1022 // The value could be a mask for an if test, the index of the loop
1023 // header for an end of loop, or the constant value for a store instruction
1024 OPType type;
1025 uint64_t value;
1026 Ballot bvalue;
1027 uint32_t caseValue;
1028 };
1029
1030 class RandomProgram
1031 {
1032
1033 public:
1034 RandomProgram(const CaseDef &c, uint32_t invocationCount = 0u)
1035 : caseDef(c)
1036 , invocationStride(invocationCount ? invocationCount : (c.sizeX * c.sizeY))
1037 , rnd()
1038 , ops()
1039 , masks()
1040 , ballotMasks()
1041 , numMasks(5)
1042 , nesting(0)
1043 , maxNesting(c.maxNesting)
1044 , loopNesting(0)
1045 , loopNestingThisFunction(0)
1046 , callNesting(0)
1047 , minCount(30)
1048 , indent(0)
1049 , isLoopInf(100, false)
1050 , doneInfLoopBreak(100, false)
1051 , storeBase(0x10000)
1052 {
1053 deRandom_init(&rnd, caseDef.seed);
1054 for (int i = 0; i < numMasks; ++i)
1055 {
1056 const uint64_t lo = deRandom_getUint64(&rnd);
1057 const uint64_t hi = deRandom_getUint64(&rnd);
1058 const tcu::UVec4 v4(uint32_t(lo), uint32_t(lo >> 32), uint32_t(hi), uint32_t(hi >> 32));
1059 ballotMasks.emplace_back(v4);
1060 masks.push_back(lo);
1061 }
1062 }
1063 virtual ~RandomProgram() = default;
1064
1065 const CaseDef caseDef;
1066 const uint32_t invocationStride;
1067 deRandom rnd;
1068 vector<OP> ops;
1069 vector<uint64_t> masks;
1070 vector<Ballot> ballotMasks;
1071 int32_t numMasks;
1072 int32_t nesting;
1073 int32_t maxNesting;
1074 int32_t loopNesting;
1075 int32_t loopNestingThisFunction;
1076 int32_t callNesting;
1077 int32_t minCount;
1078 int32_t indent;
1079 vector<bool> isLoopInf;
1080 vector<bool> doneInfLoopBreak;
1081 // Offset the value we use for OP_STORE, to avoid colliding with fully converged
1082 // active masks with small subgroup sizes (e.g. with subgroupSize == 4, the SUCF
1083 // tests need to know that 0xF is really an active mask).
1084 int32_t storeBase;
1085
1086 virtual void genIf(IFType ifType, uint32_t maxLocalIndexCmp = 0u)
1087 {
1088 uint32_t maskIdx = deRandom_getUint32(&rnd) % numMasks;
1089 uint64_t mask = masks[maskIdx];
1090 Ballot bmask = ballotMasks[maskIdx];
1091 if (ifType == IF_UNIFORM)
1092 {
1093 mask = ~0ULL;
1094 bmask.set();
1095 }
1096
1097 uint32_t localIndexCmp = deRandom_getUint32(&rnd) % (maxLocalIndexCmp ? maxLocalIndexCmp : invocationStride);
1098 if (ifType == IF_LOCAL_INVOCATION_INDEX)
1099 ops.push_back({OP_IF_LOCAL_INVOCATION_INDEX, localIndexCmp});
1100 else if (ifType == IF_LOOPCOUNT)
1101 ops.push_back({OP_IF_LOOPCOUNT, 0});
1102 else
1103 {
1104 ops.push_back({OP_IF_MASK, mask});
1105 ops.back().bvalue = bmask;
1106 }
1107
1108 nesting++;
1109
1110 size_t thenBegin = ops.size();
1111 pickOP(2);
1112 size_t thenEnd = ops.size();
1113
1114 uint32_t randElse = (deRandom_getUint32(&rnd) % 100);
1115 if (randElse < 50)
1116 {
1117 if (ifType == IF_LOCAL_INVOCATION_INDEX)
1118 ops.push_back({OP_ELSE_LOCAL_INVOCATION_INDEX, localIndexCmp});
1119 else if (ifType == IF_LOOPCOUNT)
1120 ops.push_back({OP_ELSE_LOOPCOUNT, 0});
1121 else
1122 ops.push_back({OP_ELSE_MASK, 0});
1123
1124 if (randElse < 10)
1125 {
1126 // Sometimes make the else block identical to the then block
1127 for (size_t i = thenBegin; i < thenEnd; ++i)
1128 ops.push_back(ops[i]);
1129 }
1130 else
1131 pickOP(2);
1132 }
1133 ops.push_back({OP_ENDIF, 0});
1134 nesting--;
1135 }
1136
1137 void genForUnif()
1138 {
1139 uint32_t iterCount = (deRandom_getUint32(&rnd) % 5) + 1;
1140 ops.push_back({OP_BEGIN_FOR_UNIF, iterCount});
1141 uint32_t loopheader = (uint32_t)ops.size() - 1;
1142 nesting++;
1143 loopNesting++;
1144 loopNestingThisFunction++;
1145 pickOP(2);
1146 ops.push_back({OP_END_FOR_UNIF, loopheader});
1147 loopNestingThisFunction--;
1148 loopNesting--;
1149 nesting--;
1150 }
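// The ops emitted above correspond (via printCode below) to GLSL roughly of the form:
//   for (int loopIdxN = 0; loopIdxN < inputA.a[iterCount]; loopIdxN++) { ... }
// where inputA.a[iterCount] is uniform across the workgroup.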
1151
1152 void genDoWhileUnif()
1153 {
1154 uint32_t iterCount = (deRandom_getUint32(&rnd) % 5) + 1;
1155 ops.push_back({OP_BEGIN_DO_WHILE_UNIF, iterCount});
1156 uint32_t loopheader = (uint32_t)ops.size() - 1;
1157 nesting++;
1158 loopNesting++;
1159 loopNestingThisFunction++;
1160 pickOP(2);
1161 ops.push_back({OP_END_DO_WHILE_UNIF, loopheader});
1162 loopNestingThisFunction--;
1163 loopNesting--;
1164 nesting--;
1165 }
1166
1167 void genForVar()
1168 {
1169 ops.push_back({OP_BEGIN_FOR_VAR, 0});
1170 uint32_t loopheader = (uint32_t)ops.size() - 1;
1171 nesting++;
1172 loopNesting++;
1173 loopNestingThisFunction++;
1174 pickOP(2);
1175 ops.push_back({OP_END_FOR_VAR, loopheader});
1176 loopNestingThisFunction--;
1177 loopNesting--;
1178 nesting--;
1179 }
1180
1181 void genForInf()
1182 {
1183 ops.push_back({OP_BEGIN_FOR_INF, 0});
1184 uint32_t loopheader = (uint32_t)ops.size() - 1;
1185
1186 nesting++;
1187 loopNesting++;
1188 loopNestingThisFunction++;
1189 isLoopInf[loopNesting] = true;
1190 doneInfLoopBreak[loopNesting] = false;
1191
1192 pickOP(2);
1193
1194 genElect(true);
1195 doneInfLoopBreak[loopNesting] = true;
1196
1197 pickOP(2);
1198
1199 ops.push_back({OP_END_FOR_INF, loopheader});
1200
1201 isLoopInf[loopNesting] = false;
1202 doneInfLoopBreak[loopNesting] = false;
1203 loopNestingThisFunction--;
1204 loopNesting--;
1205 nesting--;
1206 }
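// As described at OP_BEGIN_FOR_INF, the generated loop has the shape
//   for (int loopIdxN = 0;; ++loopIdxN, <ballot>) { ... if (subgroupElect()) { ... break; } ... }
// so every invocation eventually leaves the otherwise infinite loop.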
1207
1208 void genDoWhileInf()
1209 {
1210 ops.push_back({OP_BEGIN_DO_WHILE_INF, 0});
1211 uint32_t loopheader = (uint32_t)ops.size() - 1;
1212
1213 nesting++;
1214 loopNesting++;
1215 loopNestingThisFunction++;
1216 isLoopInf[loopNesting] = true;
1217 doneInfLoopBreak[loopNesting] = false;
1218
1219 pickOP(2);
1220
1221 genElect(true);
1222 doneInfLoopBreak[loopNesting] = true;
1223
1224 pickOP(2);
1225
1226 ops.push_back({OP_END_DO_WHILE_INF, loopheader});
1227
1228 isLoopInf[loopNesting] = false;
1229 doneInfLoopBreak[loopNesting] = false;
1230 loopNestingThisFunction--;
1231 loopNesting--;
1232 nesting--;
1233 }
1234
1235 void genBreak()
1236 {
1237 if (loopNestingThisFunction > 0)
1238 {
1239 // Sometimes put the break in a divergent if
1240 if ((deRandom_getUint32(&rnd) % 100) < 10)
1241 {
1242 ops.push_back({OP_IF_MASK, masks[0]});
1243 ops.back().bvalue = ballotMasks[0];
1244 ops.push_back({OP_BREAK, 0});
1245 ops.push_back({OP_ELSE_MASK, 0});
1246 ops.push_back({OP_BREAK, 0});
1247 ops.push_back({OP_ENDIF, 0});
1248 }
1249 else
1250 ops.push_back({OP_BREAK, 0});
1251 }
1252 }
1253
1254 void genContinue()
1255 {
1256 // continues are allowed if we're in a loop and the loop is not infinite,
1257 // or if it is infinite and we've already done a subgroupElect+break.
1258 // However, adding more continues seems to reduce the failure rate, so
1259 // disable it for now
1260 if (loopNestingThisFunction > 0 && !(isLoopInf[loopNesting] /*&& !doneInfLoopBreak[loopNesting]*/))
1261 {
1262 // Sometimes put the continue in a divergent if
1263 if ((deRandom_getUint32(&rnd) % 100) < 10)
1264 {
1265 ops.push_back({OP_IF_MASK, masks[0]});
1266 ops.back().bvalue = ballotMasks[0];
1267 ops.push_back({OP_CONTINUE, 0});
1268 ops.push_back({OP_ELSE_MASK, 0});
1269 ops.push_back({OP_CONTINUE, 0});
1270 ops.push_back({OP_ENDIF, 0});
1271 }
1272 else
1273 ops.push_back({OP_CONTINUE, 0});
1274 }
1275 }
1276
1277 // doBreak is used to generate "if (subgroupElect()) { ... break; }" inside infinite loops
1278 void genElect(bool doBreak)
1279 {
1280 ops.push_back({OP_ELECT, 0});
1281 nesting++;
1282 if (doBreak)
1283 {
1284 // Put something interesting before the break
1285 genBallot();
1286 genBallot();
1287 if ((deRandom_getUint32(&rnd) % 100) < 10)
1288 pickOP(1);
1289
1290 // if we're in a function, sometimes use return instead
1291 if (callNesting > 0 && (deRandom_getUint32(&rnd) % 100) < 30)
1292 ops.push_back({OP_RETURN, 0});
1293 else
1294 genBreak();
1295 }
1296 else
1297 pickOP(2);
1298
1299 ops.push_back({OP_ENDIF, 0});
1300 nesting--;
1301 }
1302
1303 void genReturn()
1304 {
1305 uint32_t r = deRandom_getUint32(&rnd) % 100;
1306 if (nesting > 0 &&
1307 // Use return rarely in main, 20% of the time in a singly nested loop in a function
1308 // and 50% of the time in a multiply nested loop in a function
1309 (r < 5 || (callNesting > 0 && loopNestingThisFunction > 0 && r < 20) ||
1310 (callNesting > 0 && loopNestingThisFunction > 1 && r < 50)))
1311 {
1312 genBallot();
1313 if ((deRandom_getUint32(&rnd) % 100) < 10)
1314 {
1315 ops.push_back({OP_IF_MASK, masks[0]});
1316 ops.back().bvalue = ballotMasks[0];
1317 ops.push_back({OP_RETURN, 0});
1318 ops.push_back({OP_ELSE_MASK, 0});
1319 ops.push_back({OP_RETURN, 0});
1320 ops.push_back({OP_ENDIF, 0});
1321 }
1322 else
1323 ops.push_back({OP_RETURN, 0});
1324 }
1325 }
1326
1327 // Generate a function call. Save and restore some loop information, which is used to
1328 // determine when it's safe to use break/continue
1329 void genCall()
1330 {
1331 ops.push_back({OP_CALL_BEGIN, 0});
1332 callNesting++;
1333 nesting++;
1334 int32_t saveLoopNestingThisFunction = loopNestingThisFunction;
1335 loopNestingThisFunction = 0;
1336
1337 pickOP(2);
1338
1339 loopNestingThisFunction = saveLoopNestingThisFunction;
1340 nesting--;
1341 callNesting--;
1342 ops.push_back({OP_CALL_END, 0});
1343 }
1344
1345 // Generate switch on a uniform value:
1346 // switch (inputA.a[r]) {
1347 // case r+1: ... break; // should not execute
1348 // case r: ... break; // should branch uniformly
1349 // case r+2: ... break; // should not execute
1350 // }
1351 void genSwitchUnif()
1352 {
1353 uint32_t r = deRandom_getUint32(&rnd) % 5;
1354 ops.push_back({OP_SWITCH_UNIF_BEGIN, r});
1355 nesting++;
1356
1357 ops.push_back({OP_CASE_MASK_BEGIN, 0, 1u << (r + 1)});
1358 pickOP(1);
1359 ops.push_back({OP_CASE_END, 0});
1360
1361 ops.push_back({OP_CASE_MASK_BEGIN, ~0ULL, 1u << r});
1362 ops.back().bvalue.set();
1363 pickOP(2);
1364 ops.push_back({OP_CASE_END, 0});
1365
1366 ops.push_back({OP_CASE_MASK_BEGIN, 0, 1u << (r + 2)});
1367 pickOP(1);
1368 ops.push_back({OP_CASE_END, 0});
1369
1370 ops.push_back({OP_SWITCH_END, 0});
1371 nesting--;
1372 }
1373
1374 // switch (gl_SubgroupInvocationID & 3) with four unique targets
1375 void genSwitchVar()
1376 {
1377 ops.push_back({OP_SWITCH_VAR_BEGIN, 0});
1378 nesting++;
1379
1380 ops.push_back({OP_CASE_MASK_BEGIN, 0x1111111111111111ULL, 1 << 0});
1381 ops.back().bvalue = tcu::UVec4(0x11111111);
1382 pickOP(1);
1383 ops.push_back({OP_CASE_END, 0});
1384
1385 ops.push_back({OP_CASE_MASK_BEGIN, 0x2222222222222222ULL, 1 << 1});
1386 ops.back().bvalue = tcu::UVec4(0x22222222);
1387 pickOP(1);
1388 ops.push_back({OP_CASE_END, 0});
1389
1390 ops.push_back({OP_CASE_MASK_BEGIN, 0x4444444444444444ULL, 1 << 2});
1391 ops.back().bvalue = tcu::UVec4(0x44444444);
1392 pickOP(1);
1393 ops.push_back({OP_CASE_END, 0});
1394
1395 ops.push_back({OP_CASE_MASK_BEGIN, 0x8888888888888888ULL, 1 << 3});
1396 ops.back().bvalue = tcu::UVec4(0x88888888);
1397 pickOP(1);
1398 ops.push_back({OP_CASE_END, 0});
1399
1400 ops.push_back({OP_SWITCH_END, 0});
1401 nesting--;
1402 }
1403
1404 // switch (gl_SubgroupInvocationID & 3) with two shared targets.
1405 // XXX TODO: The test considers these two targets to remain converged,
1406 // though we haven't agreed to that behavior yet.
1407 void genSwitchMulticase()
1408 {
1409 ops.push_back({OP_SWITCH_VAR_BEGIN, 0});
1410 nesting++;
1411
1412 ops.push_back({OP_CASE_MASK_BEGIN, 0x3333333333333333ULL, (1 << 0) | (1 << 1)});
1413 ops.back().bvalue = tcu::UVec4(0x33333333);
1414 pickOP(2);
1415 ops.push_back({OP_CASE_END, 0});
1416
1417 ops.push_back({OP_CASE_MASK_BEGIN, 0xCCCCCCCCCCCCCCCCULL, (1 << 2) | (1 << 3)});
1418 ops.back().bvalue = tcu::UVec4(0xCCCCCCCC);
1419 pickOP(2);
1420 ops.push_back({OP_CASE_END, 0});
1421
1422 ops.push_back({OP_SWITCH_END, 0});
1423 nesting--;
1424 }
1425
1426 // switch (loopIdxN) {
1427 // case 1: ... break;
1428 // case 2: ... break;
1429 // default: ... break;
1430 // }
1431 void genSwitchLoopCount()
1432 {
1433 uint32_t r = deRandom_getUint32(&rnd) % loopNesting;
1434 ops.push_back({OP_SWITCH_LOOP_COUNT_BEGIN, r});
1435 nesting++;
1436
1437 ops.push_back({OP_CASE_LOOP_COUNT_BEGIN, 1ULL << 1, 1});
1438 ops.back().bvalue = tcu::UVec4(1 << 1, 0, 0, 0);
1439 pickOP(1);
1440 ops.push_back({OP_CASE_END, 0});
1441
1442 ops.push_back({OP_CASE_LOOP_COUNT_BEGIN, 1ULL << 2, 2});
1443 ops.back().bvalue = tcu::UVec4(1 << 2, 0, 0, 0);
1444 pickOP(1);
1445 ops.push_back({OP_CASE_END, 0});
1446
1447 // default:
1448 ops.push_back({OP_CASE_LOOP_COUNT_BEGIN, ~6ULL, 0xFFFFFFFF});
1449 ops.back().bvalue = tcu::UVec4(~6u, ~0u, ~0u, ~0u);
1450 pickOP(1);
1451 ops.push_back({OP_CASE_END, 0});
1452
1453 ops.push_back({OP_SWITCH_END, 0});
1454 nesting--;
1455 }
1456
1457 void pickOP(uint32_t count)
1458 {
1459 // Pick "count" instructions. These can recursively insert more instructions,
1460 // so "count" is just a seed
1461 for (uint32_t i = 0; i < count; ++i)
1462 {
1463 genBallot();
1464 if (nesting < maxNesting)
1465 {
1466 uint32_t r = deRandom_getUint32(&rnd) % 11;
1467 switch (r)
1468 {
1469 default:
1470 DE_ASSERT(0);
1471 // fallthrough
1472 case 2:
1473 if (loopNesting)
1474 {
1475 genIf(IF_LOOPCOUNT);
1476 break;
1477 }
1478 // fallthrough
1479 case 10:
1480 genIf(IF_LOCAL_INVOCATION_INDEX);
1481 break;
1482 case 0:
1483 genIf(IF_MASK);
1484 break;
1485 case 1:
1486 genIf(IF_UNIFORM);
1487 break;
1488 case 3:
1489 {
1490 // don't nest loops too deeply, to avoid extreme memory usage or timeouts
1491 if (loopNesting <= 3)
1492 {
1493 uint32_t r2 = deRandom_getUint32(&rnd) % 3;
1494 switch (r2)
1495 {
1496 default:
1497 DE_ASSERT(0); // fallthrough
1498 case 0:
1499 genForUnif();
1500 break;
1501 case 1:
1502 genForInf();
1503 break;
1504 case 2:
1505 genForVar();
1506 break;
1507 }
1508 }
1509 }
1510 break;
1511 case 4:
1512 genBreak();
1513 break;
1514 case 5:
1515 genContinue();
1516 break;
1517 case 6:
1518 genElect(false);
1519 break;
1520 case 7:
1521 {
1522 uint32_t r2 = deRandom_getUint32(&rnd) % 5;
1523 if (r2 == 0 && callNesting == 0 && nesting < maxNesting - 2)
1524 genCall();
1525 else
1526 genReturn();
1527 break;
1528 }
1529 case 8:
1530 {
1531 // don't nest loops too deeply, to avoid extreme memory usage or timeouts
1532 if (loopNesting <= 3)
1533 {
1534 uint32_t r2 = deRandom_getUint32(&rnd) % 2;
1535 switch (r2)
1536 {
1537 default:
1538 DE_ASSERT(0); // fallthrough
1539 case 0:
1540 genDoWhileUnif();
1541 break;
1542 case 1:
1543 genDoWhileInf();
1544 break;
1545 }
1546 }
1547 }
1548 break;
1549 case 9:
1550 {
1551 uint32_t r2 = deRandom_getUint32(&rnd) % 4;
1552 switch (r2)
1553 {
1554 default:
1555 DE_ASSERT(0);
1556 // fallthrough
1557 case 0:
1558 genSwitchUnif();
1559 break;
1560 case 1:
1561 if (loopNesting > 0)
1562 {
1563 genSwitchLoopCount();
1564 break;
1565 }
1566 // fallthrough
1567 case 2:
1568 if (caseDef.testType != TT_MAXIMAL)
1569 {
1570 // multicase doesn't have fully-defined behavior for MAXIMAL tests,
1571 // but does for SUCF tests
1572 genSwitchMulticase();
1573 break;
1574 }
1575 // fallthrough
1576 case 3:
1577 genSwitchVar();
1578 break;
1579 }
1580 }
1581 break;
1582 }
1583 }
1584 genBallot();
1585 }
1586 }
1587
1588 void genBallot()
1589 {
1590 // optionally insert ballots, stores, and noise. Ballots and stores are used to determine
1591 // correctness.
1592 if ((deRandom_getUint32(&rnd) % 100) < 20)
1593 {
1594 if (ops.size() < 2 || !(ops[ops.size() - 1].type == OP_BALLOT ||
1595 (ops[ops.size() - 1].type == OP_STORE && ops[ops.size() - 2].type == OP_BALLOT)))
1596 {
1597 // do a store along with each ballot, so we can correlate where
1598 // the ballot came from
1599 if (caseDef.testType != TT_MAXIMAL)
1600 ops.push_back({OP_STORE, (uint32_t)ops.size() + storeBase});
1601 ops.push_back({OP_BALLOT, 0});
1602 }
1603 }
1604
1605 if ((deRandom_getUint32(&rnd) % 100) < 10)
1606 {
1607 if (ops.size() < 2 || !(ops[ops.size() - 1].type == OP_STORE ||
1608 (ops[ops.size() - 1].type == OP_BALLOT && ops[ops.size() - 2].type == OP_STORE)))
1609 {
1610 // SUCF does a store with every ballot. Don't bloat the code by adding more.
1611 if (caseDef.testType == TT_MAXIMAL)
1612 ops.push_back({OP_STORE, (uint32_t)ops.size() + storeBase});
1613 }
1614 }
1615
1616 uint32_t r = deRandom_getUint32(&rnd) % 10000;
1617 if (r < 3)
1618 ops.push_back({OP_NOISE, 0});
1619 else if (r < 10)
1620 ops.push_back({OP_NOISE, 1});
1621 }
1622
1623 std::map<uint32_t, uint32_t> generateRandomProgram(qpWatchDog *watchDog, add_ref<tcu::TestLog> log)
1624 {
1625 std::vector<tcu::UVec4> ref;
1626 std::map<uint32_t, uint32_t> subgroupSizeToMaxLoc;
1627
1628 do
1629 {
1630 ops.clear();
1631 while ((int32_t)ops.size() < minCount)
1632 pickOP(1);
1633
1634 // Retry until the program has some UCF results in it
1635 if (caseDef.isUCF())
1636 {
1637 // Simulate for all subgroup sizes, to determine whether OP_BALLOTs are nonuniform
1638 for (int32_t subgroupSize = 4; subgroupSize <= 128; subgroupSize *= 2)
1639 {
1640 //simulate(true, subgroupSize, ref);
1641 const uint32_t maxLoc = execute(watchDog, true, subgroupSize, 0u, invocationStride, ref, log);
1642 subgroupSizeToMaxLoc[subgroupSize] = maxLoc;
1643 }
1644 }
1645 } while (caseDef.isUCF() && !hasUCF());
1646
1647 return subgroupSizeToMaxLoc;
1648 }
1649
1650 void printIndent(std::stringstream &css)
1651 {
1652 for (int32_t i = 0; i < indent; ++i)
1653 css << " ";
1654 }
1655
1656 struct FlowState
1657 {
1658 add_cref<vector<OP>> ops;
1659 const int32_t opsIndex;
1660 const int32_t loopNesting;
1661 const int funcNum;
1662 };
1663
1664 // State of the subgroup at each level of nesting
1665 struct SubgroupState
1666 {
1667 // Currently executing
1668 bitset_inv_t activeMask;
1669 // Have executed a continue instruction in this loop
1670 bitset_inv_t continueMask;
1671 // index of the current if test or loop header
1672 uint32_t header;
1673 // number of loop iterations performed
1674 uint32_t tripCount;
1675 // is this nesting a loop?
1676 uint32_t isLoop;
1677 // is this nesting a function call?
1678 uint32_t isCall;
1679 // is this nesting a switch?
1680 uint32_t isSwitch;
1681 };
1682
1683 struct SubgroupState2
1684 {
1685 // Currently executing
1686 Ballots activeMask;
1687 // Have executed a continue instruction in this loop
1688 Ballots continueMask;
1689 // index of the current if test or loop header
1690 uint32_t header;
1691 // number of loop iterations performed
1692 uint32_t tripCount;
1693 // is this nesting a loop?
1694 uint32_t isLoop;
1695 // is this nesting a function call?
1696 uint32_t isCall;
1697 // is this nesting a switch?
1698 uint32_t isSwitch;
1699 virtual ~SubgroupState2() = default;
1700 SubgroupState2() : SubgroupState2(0)
1701 {
1702 }
1703 SubgroupState2(uint32_t subgroupCount)
1704 : activeMask(subgroupCount)
1705 , continueMask(subgroupCount)
1706 , header()
1707 , tripCount()
1708 , isLoop()
1709 , isCall()
1710 , isSwitch()
1711 {
1712 }
1713 };
1714
1715 struct Prerequisites
1716 {
1717 };
1718
1719 virtual std::string getPartitionBallotText()
1720 {
1721 return "subgroupBallot(true)";
1722 }
1723
1724 virtual void printIfLocalInvocationIndex(std::stringstream &css, add_cref<FlowState> flow)
1725 {
1726 printIndent(css);
1727 css << "if (gl_LocalInvocationIndex >= inputA.a[0x" << std::hex << flow.ops[flow.opsIndex].value << "]) {\n";
1728 }
1729
1730 virtual void printStore(std::stringstream &css, add_cref<FlowState> flow)
1731 {
1732 printIndent(css);
1733 css << "outputC.loc[gl_LocalInvocationIndex]++;\n";
1734 printIndent(css);
1735 css << "outputB.b[(outLoc++)*invocationStride + gl_LocalInvocationIndex].x = 0x" << std::hex
1736 << flow.ops[flow.opsIndex].value << ";\n";
1737 }
1738
1739 virtual void printBallot(std::stringstream &css, add_cref<FlowState>, bool endWithSemicolon = false)
1740 {
1741 printIndent(css);
1742
1743 css << "outputC.loc[gl_LocalInvocationIndex]++,";
1744 // When inside loop(s), use partitionBallot rather than subgroupBallot to compute
1745 // a ballot, to make sure the ballot is "diverged enough". Don't do this for
1746 // subgroup_uniform_control_flow, since we only validate results that must be fully
1747 // reconverged.
1748 if (loopNesting > 0 && caseDef.testType == TT_MAXIMAL)
1749 {
1750 css << "outputB.b[(outLoc++)*invocationStride + gl_LocalInvocationIndex] = " << getPartitionBallotText()
1751 << ".xy";
1752 }
1753 else if (caseDef.isElect())
1754 {
1755 css << "outputB.b[(outLoc++)*invocationStride + gl_LocalInvocationIndex].x = elect()";
1756 }
1757 else
1758 {
1759 css << "outputB.b[(outLoc++)*invocationStride + gl_LocalInvocationIndex] = subgroupBallot(true).xy";
1760 }
1761 if (endWithSemicolon)
1762 {
1763 css << ";\n";
1764 }
1765 }
1766
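// Walk the generated op list and emit the corresponding GLSL, writing helper functions
// to 'functions' and the body of main() to 'main', while tracking indentation and loop
// nesting so the printed code mirrors the structure that execute() simulates.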
1767 void printCode(std::stringstream &functions, std::stringstream &main)
1768 {
1769 std::stringstream *css = &main;
1770 indent = 4;
1771 loopNesting = 0;
1772 int funcNum = 0;
1773 int32_t i = 0;
1774
1775 auto makeFlowState = [&]() -> FlowState { return FlowState{ops, i, loopNesting, funcNum}; };
1776
1777 for (; i < (int32_t)ops.size(); ++i)
1778 {
1779 switch (ops[i].type)
1780 {
1781 case OP_IF_MASK:
1782 printIndent(*css);
1783 if (ops[i].value == ~0ULL)
1784 {
1785 // This equality test will always succeed, since inputA.a[i] == i
1786 int idx = deRandom_getUint32(&rnd) % 4;
1787 *css << "if (inputA.a[" << idx << "] == " << idx << ") {\n";
1788 }
1789 else
1790 {
1791 const tcu::UVec4 v(ops[i].bvalue);
1792 *css << std::hex << "if (testBit(uvec4("
1793 << "0x" << v.x() << ", "
1794 << "0x" << v.y() << ", "
1795 << "0x" << v.z() << ", "
1796 << "0x" << v.w() << std::dec << "), gl_SubgroupInvocationID)) {\n";
1797 }
1798 indent += 4;
1799 break;
1800 case OP_IF_LOOPCOUNT:
1801 printIndent(*css);
1802 *css << "if (gl_SubgroupInvocationID == loopIdx" << loopNesting - 1 << ") {\n";
1803 indent += 4;
1804 break;
1805 case OP_IF_LOCAL_INVOCATION_INDEX:
1806 printIfLocalInvocationIndex(*css, makeFlowState());
1807 indent += 4;
1808 break;
1809 case OP_ELSE_MASK:
1810 case OP_ELSE_LOOPCOUNT:
1811 case OP_ELSE_LOCAL_INVOCATION_INDEX:
1812 indent -= 4;
1813 printIndent(*css);
1814 *css << "} else {\n";
1815 indent += 4;
1816 break;
1817 case OP_ENDIF:
1818 indent -= 4;
1819 printIndent(*css);
1820 *css << "}\n";
1821 break;
1822 case OP_BALLOT:
1823 printBallot(*css, makeFlowState(), true);
1824 break;
1825 case OP_STORE:
1826 printStore(*css, makeFlowState());
1827 break;
1828 case OP_BEGIN_FOR_VAR:
1829 printIndent(*css);
1830 *css << "for (int loopIdx" << loopNesting << " = 0;\n";
1831 printIndent(*css);
1832 *css << " loopIdx" << loopNesting << " < gl_SubgroupInvocationID + 1;\n";
1833 printIndent(*css);
1834 *css << " loopIdx" << loopNesting << "++) {\n";
1835 indent += 4;
1836 loopNesting++;
1837 break;
1838 case OP_END_FOR_VAR:
1839 loopNesting--;
1840 indent -= 4;
1841 printIndent(*css);
1842 *css << "}\n";
1843 break;
1844 case OP_BEGIN_FOR_UNIF:
1845 printIndent(*css);
1846 *css << "for (int loopIdx" << loopNesting << " = 0;\n";
1847 printIndent(*css);
1848 *css << " loopIdx" << loopNesting << " < inputA.a[" << ops[i].value << "];\n";
1849 printIndent(*css);
1850 *css << " loopIdx" << loopNesting << "++) {\n";
1851 indent += 4;
1852 loopNesting++;
1853 break;
1854 case OP_END_FOR_UNIF:
1855 loopNesting--;
1856 indent -= 4;
1857 printIndent(*css);
1858 *css << "}\n";
1859 break;
1860 case OP_BEGIN_FOR_INF:
1861 printIndent(*css);
1862 *css << "for (int loopIdx" << loopNesting << " = 0;;loopIdx" << loopNesting << "++,";
1863 loopNesting++;
1864 printBallot(*css, makeFlowState());
1865 *css << ") {\n";
1866 indent += 4;
1867 break;
1868 case OP_END_FOR_INF:
1869 loopNesting--;
1870 indent -= 4;
1871 printIndent(*css);
1872 *css << "}\n";
1873 break;
1874 case OP_BEGIN_DO_WHILE_UNIF:
1875 printIndent(*css);
1876 *css << "{\n";
1877 indent += 4;
1878 printIndent(*css);
1879 *css << "int loopIdx" << loopNesting << " = 0;\n";
1880 printIndent(*css);
1881 *css << "do {\n";
1882 indent += 4;
1883 printIndent(*css);
1884 *css << "loopIdx" << loopNesting << "++;\n";
1885 loopNesting++;
1886 break;
1887 case OP_END_DO_WHILE_UNIF:
1888 loopNesting--;
1889 indent -= 4;
1890 printIndent(*css);
1891 *css << "} while (loopIdx" << loopNesting << " < inputA.a[" << ops[(uint32_t)ops[i].value].value
1892 << "]);\n";
1893 indent -= 4;
1894 printIndent(*css);
1895 *css << "}\n";
1896 break;
1897 case OP_BEGIN_DO_WHILE_INF:
1898 printIndent(*css);
1899 *css << "{\n";
1900 indent += 4;
1901 printIndent(*css);
1902 *css << "int loopIdx" << loopNesting << " = 0;\n";
1903 printIndent(*css);
1904 *css << "do {\n";
1905 indent += 4;
1906 loopNesting++;
1907 break;
1908 case OP_END_DO_WHILE_INF:
1909 loopNesting--;
1910 printIndent(*css);
1911 *css << "loopIdx" << loopNesting << "++;\n";
1912 indent -= 4;
1913 printIndent(*css);
1914 *css << "} while (true);\n";
1915 indent -= 4;
1916 printIndent(*css);
1917 *css << "}\n";
1918 break;
1919 case OP_BREAK:
1920 printIndent(*css);
1921 *css << "break;\n";
1922 break;
1923 case OP_CONTINUE:
1924 printIndent(*css);
1925 *css << "continue;\n";
1926 break;
1927 case OP_ELECT:
1928 printIndent(*css);
1929 *css << "if (subgroupElect()) {\n";
1930 indent += 4;
1931 break;
1932 case OP_RETURN:
1933 printIndent(*css);
1934 *css << "return;\n";
1935 break;
1936 case OP_CALL_BEGIN:
1937 printIndent(*css);
1938 *css << "func" << funcNum << "(";
1939 for (int32_t n = 0; n < loopNesting; ++n)
1940 {
1941 *css << "loopIdx" << n;
1942 if (n != loopNesting - 1)
1943 *css << ", ";
1944 }
1945 *css << ");\n";
1946 css = &functions;
1947 printIndent(*css);
1948 *css << "void func" << funcNum << "(";
1949 for (int32_t n = 0; n < loopNesting; ++n)
1950 {
1951 *css << "int loopIdx" << n;
1952 if (n != loopNesting - 1)
1953 *css << ", ";
1954 }
1955 *css << ") {\n";
1956 indent += 4;
1957 funcNum++;
1958 break;
1959 case OP_CALL_END:
1960 indent -= 4;
1961 printIndent(*css);
1962 *css << "}\n";
1963 css = &main;
1964 break;
1965 case OP_NOISE:
1966 if (ops[i].value == 0)
1967 {
1968 printIndent(*css);
1969 *css << "while (!subgroupElect()) {}\n";
1970 }
1971 else
1972 {
1973 printIndent(*css);
1974 *css << "if (inputA.a[0] == 12345) {\n";
1975 indent += 4;
1976 printIndent(*css);
1977 *css << "while (true) {\n";
1978 indent += 4;
1979 printBallot(*css, makeFlowState(), true);
1980 indent -= 4;
1981 printIndent(*css);
1982 *css << "}\n";
1983 indent -= 4;
1984 printIndent(*css);
1985 *css << "}\n";
1986 }
1987 break;
1988 case OP_SWITCH_UNIF_BEGIN:
1989 printIndent(*css);
1990 *css << "switch (inputA.a[" << ops[i].value << "]) {\n";
1991 indent += 4;
1992 break;
1993 case OP_SWITCH_VAR_BEGIN:
1994 printIndent(*css);
1995 *css << "switch (gl_SubgroupInvocationID & 3) {\n";
1996 indent += 4;
1997 break;
1998 case OP_SWITCH_LOOP_COUNT_BEGIN:
1999 printIndent(*css);
2000 *css << "switch (loopIdx" << ops[i].value << ") {\n";
2001 indent += 4;
2002 break;
2003 case OP_SWITCH_END:
2004 indent -= 4;
2005 printIndent(*css);
2006 *css << "}\n";
2007 break;
2008 case OP_CASE_MASK_BEGIN:
2009 for (int32_t b = 0; b < 32; ++b)
2010 {
2011 if ((1u << b) & ops[i].caseValue)
2012 {
2013 printIndent(*css);
2014 *css << "case " << b << ":\n";
2015 }
2016 }
2017 printIndent(*css);
2018 *css << "{\n";
2019 indent += 4;
2020 break;
2021 case OP_CASE_LOOP_COUNT_BEGIN:
2022 if (ops[i].caseValue == 0xFFFFFFFF)
2023 {
2024 printIndent(*css);
2025 *css << "default: {\n";
2026 }
2027 else
2028 {
2029 printIndent(*css);
2030 *css << "case " << ops[i].caseValue << ": {\n";
2031 }
2032 indent += 4;
2033 break;
2034 case OP_CASE_END:
2035 printIndent(*css);
2036 *css << "break;\n";
2037 indent -= 4;
2038 printIndent(*css);
2039 *css << "}\n";
2040 break;
2041 default:
2042 DE_ASSERT(0);
2043 break;
2044 }
2045 }
2046 }
2047
2048 // Simulate execution of the program. If countOnly is true, just return
2049 // the max number of outputs written. If it's false, store out the result
2050 // values to ref.
2051 virtual uint32_t simulate(bool countOnly, uint32_t subgroupSize, add_ref<std::vector<uint64_t>> ref) = 0;
2052
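// CPU-side reference interpreter for the generated program. It replays the op list with a
// stack of per-nesting-level active masks (SubgroupState2) and forwards every OP_BALLOT and
// OP_STORE to the stage-specific simulate* hooks. Returns the maximum per-invocation output
// location seen during the replay.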
2053 virtual uint32_t execute(qpWatchDog *watchDog, bool countOnly, const uint32_t subgroupSize,
2054 const uint32_t fragmentStride, const uint32_t primitiveStride,
2055 add_ref<std::vector<tcu::UVec4>> ref, add_ref<tcu::TestLog> log,
2056 add_cref<std::vector<uint32_t>> outputP = {}, const tcu::UVec4 *cmp = nullptr,
2057 const uint32_t primitiveID = (~0u))
2058 {
2059 // Per-invocation output location counters
2060 std::vector<uint32_t> outLoc;
2061 std::vector<SubgroupState2> stateStack;
2062 uint32_t subgroupCount;
2063 uint32_t logFailureCount;
2064 auto prerequisites = makePrerequisites(outputP, subgroupSize, fragmentStride, primitiveStride, stateStack,
2065 outLoc, subgroupCount);
2066 const Ballot fullSubgroupMask = subgroupSizeToMask(subgroupSize, subgroupCount);
2067
2068 logFailureCount = 10u;
2069 nesting = 0;
2070 loopNesting = 0;
2071
2072 int32_t i = 0;
2073 uint32_t loopCount = 0;
2074
2075 while (i < (int32_t)ops.size())
2076 {
2077 add_cref<Ballots> activeMask = stateStack[nesting].activeMask;
2078
2079 if ((loopCount % 5000) == 0 && watchDog)
2080 qpWatchDog_touch(watchDog);
2081
2082 switch (ops[i].type)
2083 {
2084 case OP_BALLOT:
2085 // Flag that this ballot is workgroup-nonuniform
2086 if (caseDef.isWUCF() && activeMask.any() && !activeMask.all())
2087 ops[i].caseValue = 1;
2088
2089 if (caseDef.isSUCF())
2090 {
2091 for (uint32_t id = 0; id < invocationStride; id += subgroupSize)
2092 {
2093 const Ballot subgroupMask = bitsetToBallot(activeMask, fullSubgroupMask, subgroupSize, id);
2094 // Flag that this ballot is subgroup-nonuniform
2095 if (subgroupMask != 0 && subgroupMask != fullSubgroupMask)
2096 ops[i].caseValue = 1;
2097 }
2098 }
2099
2100 simulateBallot(countOnly, activeMask, primitiveID, i, outLoc, ref, log, prerequisites, logFailureCount,
2101 (i > 0 ? ops[i - 1].type : OP_BALLOT), cmp);
2102 break;
2103 case OP_STORE:
2104 simulateStore(countOnly, stateStack[nesting].activeMask, primitiveID, ops[i].value, outLoc, ref, log,
2105 prerequisites, logFailureCount, (i > 0 ? ops[i - 1].type : OP_STORE), cmp);
2106 break;
2107 case OP_IF_MASK:
2108 nesting++;
2109 stateStack[nesting].activeMask =
2110 stateStack[nesting - 1].activeMask & ballotsFromBallot(ops[i].bvalue, subgroupSize, subgroupCount);
2111 stateStack[nesting].header = i;
2112 stateStack[nesting].isLoop = 0;
2113 stateStack[nesting].isSwitch = 0;
2114 break;
2115 case OP_ELSE_MASK:
2116 stateStack[nesting].activeMask =
2117 stateStack[nesting - 1].activeMask &
2118 ~ballotsFromBallot(ops[stateStack[nesting].header].bvalue, subgroupSize, subgroupCount);
2119 break;
2120 case OP_IF_LOOPCOUNT:
2121 {
2122 uint32_t n = nesting;
2123 while (!stateStack[n].isLoop)
2124 n--;
2125 const Ballot tripBallot = Ballot::withSetBit(stateStack[n].tripCount);
2126
2127 nesting++;
2128 stateStack[nesting].activeMask =
2129 stateStack[nesting - 1].activeMask & ballotsFromBallot(tripBallot, subgroupSize, subgroupCount);
2130 stateStack[nesting].header = i;
2131 stateStack[nesting].isLoop = 0;
2132 stateStack[nesting].isSwitch = 0;
2133 break;
2134 }
2135 case OP_ELSE_LOOPCOUNT:
2136 {
2137 uint32_t n = nesting;
2138 while (!stateStack[n].isLoop)
2139 n--;
2140 const Ballot tripBallot = Ballot::withSetBit(stateStack[n].tripCount);
2141
2142 stateStack[nesting].activeMask =
2143 stateStack[nesting - 1].activeMask & ~ballotsFromBallot(tripBallot, subgroupSize, subgroupCount);
2144 break;
2145 }
2146 case OP_IF_LOCAL_INVOCATION_INDEX:
2147 {
2148 // all bits >= N
2149 Ballots mask(subgroupCount);
2150 const uint32_t maxID = subgroupCount * subgroupSize;
2151 for (uint32_t id = static_cast<uint32_t>(ops[i].value); id < maxID; ++id)
2152 {
2153 mask.set(Ballots::findBit(id, subgroupSize));
2154 }
2155
2156 nesting++;
2157 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask & mask;
2158 stateStack[nesting].header = i;
2159 stateStack[nesting].isLoop = 0;
2160 stateStack[nesting].isSwitch = 0;
2161 break;
2162 }
2163 case OP_ELSE_LOCAL_INVOCATION_INDEX:
2164 {
2165 // all bits < N
2166 Ballots mask(subgroupCount);
2167 const uint32_t maxID = subgroupCount * subgroupSize;
2168 for (uint32_t id = 0u; id < static_cast<uint32_t>(ops[i].value) && id < maxID; ++id)
2169 {
2170 mask.set(Ballots::findBit(id, subgroupSize));
2171 }
2172
2173 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask & mask;
2174 break;
2175 }
2176 case OP_ENDIF:
2177 nesting--;
2178 break;
2179 case OP_BEGIN_FOR_UNIF:
2180 // XXX TODO: We don't handle a for loop with zero iterations
2181 nesting++;
2182 loopNesting++;
2183 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
2184 stateStack[nesting].header = i;
2185 stateStack[nesting].tripCount = 0;
2186 stateStack[nesting].isLoop = 1;
2187 stateStack[nesting].isSwitch = 0;
2188 stateStack[nesting].continueMask = 0;
2189 break;
2190 case OP_END_FOR_UNIF:
2191 stateStack[nesting].tripCount++;
2192 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
2193 stateStack[nesting].continueMask = 0;
2194 if (stateStack[nesting].tripCount < ops[stateStack[nesting].header].value &&
2195 stateStack[nesting].activeMask.any())
2196 {
2197 i = stateStack[nesting].header + 1;
2198 continue;
2199 }
2200 else
2201 {
2202 loopNesting--;
2203 nesting--;
2204 }
2205 break;
2206 case OP_BEGIN_DO_WHILE_UNIF:
2207 // XXX TODO: We don't handle a for loop with zero iterations
2208 nesting++;
2209 loopNesting++;
2210 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
2211 stateStack[nesting].header = i;
2212 stateStack[nesting].tripCount = 1;
2213 stateStack[nesting].isLoop = 1;
2214 stateStack[nesting].isSwitch = 0;
2215 stateStack[nesting].continueMask = 0;
2216 break;
2217 case OP_END_DO_WHILE_UNIF:
2218 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
2219 stateStack[nesting].continueMask = 0;
2220 if (stateStack[nesting].tripCount < ops[stateStack[nesting].header].value &&
2221 stateStack[nesting].activeMask.any())
2222 {
2223 i = stateStack[nesting].header + 1;
2224 stateStack[nesting].tripCount++;
2225 continue;
2226 }
2227 else
2228 {
2229 loopNesting--;
2230 nesting--;
2231 }
2232 break;
2233 case OP_BEGIN_FOR_VAR:
2234 // XXX TODO: We don't handle a for loop with zero iterations
2235 nesting++;
2236 loopNesting++;
2237 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
2238 stateStack[nesting].header = i;
2239 stateStack[nesting].tripCount = 0;
2240 stateStack[nesting].isLoop = 1;
2241 stateStack[nesting].isSwitch = 0;
2242 stateStack[nesting].continueMask = 0;
2243 break;
2244 case OP_END_FOR_VAR:
2245 {
2246 stateStack[nesting].tripCount++;
2247 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
2248 stateStack[nesting].continueMask = 0;
2249 Ballot tripBallot;
2250 if (subgroupSize != stateStack[nesting].tripCount)
2251 {
2252 for (uint32_t bit = stateStack[nesting].tripCount; bit < tripBallot.size(); ++bit)
2253 tripBallot.set(bit);
2254 }
2255 stateStack[nesting].activeMask &= ballotsFromBallot(tripBallot, subgroupSize, subgroupCount);
2256
2257 if (stateStack[nesting].activeMask.any())
2258 {
2259 i = stateStack[nesting].header + 1;
2260 continue;
2261 }
2262 else
2263 {
2264 loopNesting--;
2265 nesting--;
2266 }
2267 break;
2268 }
2269 case OP_BEGIN_FOR_INF:
2270 case OP_BEGIN_DO_WHILE_INF:
2271 nesting++;
2272 loopNesting++;
2273 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
2274 stateStack[nesting].header = i;
2275 stateStack[nesting].tripCount = 0;
2276 stateStack[nesting].isLoop = 1;
2277 stateStack[nesting].isSwitch = 0;
2278 stateStack[nesting].continueMask = 0;
2279 break;
2280 case OP_END_FOR_INF:
2281 stateStack[nesting].tripCount++;
2282 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
2283 stateStack[nesting].continueMask = 0;
2284 if (stateStack[nesting].activeMask.any())
2285 {
2286 // output expected OP_BALLOT values
2287 simulateBallot(countOnly, stateStack[nesting].activeMask, primitiveID, i, outLoc, ref, log,
2288 prerequisites, logFailureCount, (i > 0 ? ops[i - 1].type : OP_BALLOT), cmp);
2289
2290 i = stateStack[nesting].header + 1;
2291 continue;
2292 }
2293 else
2294 {
2295 loopNesting--;
2296 nesting--;
2297 }
2298 break;
2299 case OP_END_DO_WHILE_INF:
2300 stateStack[nesting].tripCount++;
2301 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
2302 stateStack[nesting].continueMask = 0;
2303 if (stateStack[nesting].activeMask.any())
2304 {
2305 i = stateStack[nesting].header + 1;
2306 continue;
2307 }
2308 else
2309 {
2310 loopNesting--;
2311 nesting--;
2312 }
2313 break;
2314 case OP_BREAK:
2315 {
2316 uint32_t n = nesting;
2317 const Ballots mask = stateStack[nesting].activeMask;
2318 while (true)
2319 {
2320 stateStack[n].activeMask &= ~mask;
2321 if (stateStack[n].isLoop || stateStack[n].isSwitch)
2322 break;
2323
2324 n--;
2325 }
2326 }
2327 break;
2328 case OP_CONTINUE:
2329 {
2330 uint32_t n = nesting;
2331 const Ballots mask = stateStack[nesting].activeMask;
2332 while (true)
2333 {
2334 stateStack[n].activeMask &= ~mask;
2335 if (stateStack[n].isLoop)
2336 {
2337 stateStack[n].continueMask |= mask;
2338 break;
2339 }
2340 n--;
2341 }
2342 }
2343 break;
2344 case OP_ELECT:
2345 {
2346 nesting++;
2347 stateStack[nesting].activeMask = bitsetElect(stateStack[nesting - 1].activeMask);
2348 stateStack[nesting].header = i;
2349 stateStack[nesting].isLoop = 0;
2350 stateStack[nesting].isSwitch = 0;
2351 }
2352 break;
2353 case OP_RETURN:
2354 {
2355 const Ballots mask = stateStack[nesting].activeMask;
2356 for (int32_t n = nesting; n >= 0; --n)
2357 {
2358 stateStack[n].activeMask &= ~mask;
2359 if (stateStack[n].isCall)
2360 break;
2361 }
2362 }
2363 break;
2364
2365 case OP_CALL_BEGIN:
2366 nesting++;
2367 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
2368 stateStack[nesting].isLoop = 0;
2369 stateStack[nesting].isSwitch = 0;
2370 stateStack[nesting].isCall = 1;
2371 break;
2372 case OP_CALL_END:
2373 stateStack[nesting].isCall = 0;
2374 nesting--;
2375 break;
2376 case OP_NOISE:
2377 break;
2378
2379 case OP_SWITCH_UNIF_BEGIN:
2380 case OP_SWITCH_VAR_BEGIN:
2381 case OP_SWITCH_LOOP_COUNT_BEGIN:
2382 nesting++;
2383 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
2384 stateStack[nesting].header = i;
2385 stateStack[nesting].isLoop = 0;
2386 stateStack[nesting].isSwitch = 1;
2387 break;
2388 case OP_SWITCH_END:
2389 nesting--;
2390 break;
2391 case OP_CASE_MASK_BEGIN:
2392 stateStack[nesting].activeMask =
2393 stateStack[nesting - 1].activeMask & ballotsFromBallot(ops[i].bvalue, subgroupSize, subgroupCount);
2394 break;
2395 case OP_CASE_LOOP_COUNT_BEGIN:
2396 {
2397 uint32_t n = nesting;
2398 uint32_t l = loopNesting;
2399
2400 while (true)
2401 {
2402 if (stateStack[n].isLoop)
2403 {
2404 l--;
2405 if (l == ops[stateStack[nesting].header].value)
2406 break;
2407 }
2408 n--;
2409 }
2410
2411 if ((Ballot::withSetBit(stateStack[n].tripCount) & Ballot(ops[i].bvalue)).any())
2412 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
2413 else
2414 stateStack[nesting].activeMask = 0;
2415 break;
2416 }
2417 case OP_CASE_END:
2418 break;
2419
2420 default:
2421 DE_ASSERT(0);
2422 break;
2423 }
2424 i++;
2425 loopCount++;
2426 }
2427 uint32_t maxLoc = 0;
2428 for (uint32_t id = 0; id < (uint32_t)outLoc.size(); ++id)
2429 maxLoc = de::max(maxLoc, outLoc[id]);
2430
2431 return maxLoc;
2432 }
2433
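// True if at least one OP_BALLOT in the program is expected to be fully uniform:
// execute() sets caseValue to 1 on every ballot it observes with a partially active
// mask, so a caseValue of 0 marks a ballot whose result must reconverge.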
2434 bool hasUCF() const
2435 {
2436 for (int32_t i = 0; i < (int32_t)ops.size(); ++i)
2437 {
2438 if (ops[i].type == OP_BALLOT && ops[i].caseValue == 0)
2439 return true;
2440 }
2441 return false;
2442 }
2443
2444 protected:
2445 virtual std::shared_ptr<Prerequisites> makePrerequisites(add_cref<std::vector<uint32_t>> outputP,
2446 const uint32_t subgroupSize, const uint32_t fragmentStride,
2447 const uint32_t primitiveStride,
2448 add_ref<std::vector<SubgroupState2>> stateStack,
2449 add_ref<std::vector<uint32_t>> outLoc,
2450 add_ref<uint32_t> subgroupCount)
2451 {
2452 DE_UNREF(outputP);
2453 DE_UNREF(subgroupSize);
2454 DE_UNREF(fragmentStride);
2455 DE_UNREF(primitiveStride);
2456 DE_UNREF(stateStack);
2457 DE_UNREF(outLoc);
2458 DE_UNREF(subgroupCount);
2459 return std::make_shared<Prerequisites>();
2460 }
2461
2462 virtual void simulateBallot(const bool countOnly, add_cref<Ballots> activeMask, const uint32_t primitiveID,
2463 const int32_t opsIndex, add_ref<std::vector<uint32_t>> outLoc,
2464 add_ref<std::vector<tcu::UVec4>> ref, add_ref<tcu::TestLog> log,
2465 std::shared_ptr<Prerequisites> prerequisites, add_ref<uint32_t> logFailureCount,
2466 const OPType reason, const tcu::UVec4 *cmp)
2467 {
2468 DE_UNREF(countOnly);
2469 DE_UNREF(activeMask);
2470 DE_UNREF(primitiveID);
2471 DE_UNREF(opsIndex);
2472 DE_UNREF(outLoc);
2473 DE_UNREF(ref);
2474 DE_UNREF(log);
2475 DE_UNREF(prerequisites);
2476 DE_UNREF(logFailureCount);
2477 DE_UNREF(reason);
2478 DE_UNREF(cmp);
2479 }
2480
2481 virtual void simulateStore(const bool countOnly, add_cref<Ballots> activeMask, const uint32_t primitiveID,
2482 const uint64_t storeValue, add_ref<std::vector<uint32_t>> outLoc,
2483 add_ref<std::vector<tcu::UVec4>> ref, add_ref<tcu::TestLog> log,
2484 std::shared_ptr<Prerequisites> prerequisites, add_ref<uint32_t> logFailureCount,
2485 const OPType reason, const tcu::UVec4 *cmp)
2486 {
2487 DE_UNREF(countOnly);
2488 DE_UNREF(activeMask);
2489 DE_UNREF(primitiveID);
2490 DE_UNREF(storeValue);
2491 DE_UNREF(outLoc);
2492 DE_UNREF(ref);
2493 DE_UNREF(log);
2494 DE_UNREF(prerequisites);
2495 DE_UNREF(logFailureCount);
2496 DE_UNREF(reason);
2497 DE_UNREF(cmp);
2498 }
2499 };
2500
2501 class ComputeRandomProgram : public RandomProgram
2502 {
2503 public:
2504 ComputeRandomProgram(const CaseDef &c) : RandomProgram(c, uint32_t(c.sizeX * c.sizeY))
2505 {
2506 DE_ASSERT(c.shaderStage == VK_SHADER_STAGE_COMPUTE_BIT);
2507 }
2508 virtual ~ComputeRandomProgram() = default;
2509
2510 virtual uint32_t simulate(bool countOnly, uint32_t subgroupSize, add_ref<std::vector<uint64_t>> ref) override
2511 {
2512 DE_ASSERT(false);
2513 // Do not use this method; to simulate the generated program, use simulate2 instead.
2514 DE_UNREF(countOnly);
2515 DE_UNREF(subgroupSize);
2516 DE_UNREF(ref);
2517 return 0;
2518 }
2519
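// Compute-stage prerequisites: the subgroup layout plus a per-subgroup cache of the first
// ballot value produced at a given program point, so every invocation of a subgroup records
// the same expected ballot.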
2520 struct ComputePrerequisites : Prerequisites
2521 {
2522 const uint32_t subgroupSize;
2523 const uint32_t subgroupCount;
2524 const Ballot subgroupSizeMask;
2525 std::vector<std::pair<bool, tcu::UVec4>> ballots;
2526 ComputePrerequisites(uint32_t subgroupSize_, uint32_t subgroupCount_)
2527 : subgroupSize(subgroupSize_)
2528 , subgroupCount(subgroupCount_)
2529 , subgroupSizeMask(subgroupSizeToMask(subgroupSize, subgroupCount))
2530 , ballots(subgroupCount_)
2531 {
2532 }
2533 };
2534
2535 virtual void printBallot(add_ref<std::stringstream> css, add_cref<FlowState>,
2536 bool endWithSemicolon = false) override
2537 {
2538 printIndent(css);
2539
2540 css << "outputC.loc[gl_LocalInvocationIndex]++,";
2541 // When inside loop(s), use partitionBallot rather than subgroupBallot to compute
2542 // a ballot, to make sure the ballot is "diverged enough". Don't do this for
2543 // subgroup_uniform_control_flow, since we only validate results that must be fully
2544 // reconverged.
2545 if (loopNesting > 0 && caseDef.testType == TT_MAXIMAL)
2546 {
2547 css << "outputB.b[(outLoc++)*invocationStride + gl_LocalInvocationIndex] = " << getPartitionBallotText();
2548 }
2549 else if (caseDef.isElect())
2550 {
2551 css << "outputB.b[(outLoc++)*invocationStride + gl_LocalInvocationIndex].x = elect()";
2552 }
2553 else
2554 {
2555 css << "outputB.b[(outLoc++)*invocationStride + gl_LocalInvocationIndex] = subgroupBallot(true)";
2556 }
2557 if (endWithSemicolon)
2558 {
2559 css << ";\n";
2560 }
2561 }
2562
2563 protected:
2564 virtual void simulateBallot(const bool countOnly, add_cref<Ballots> activeMask, const uint32_t unusedPrimitiveID,
2565 const int32_t opsIndex, add_ref<std::vector<uint32_t>> outLoc,
2566 add_ref<std::vector<tcu::UVec4>> ref, add_ref<tcu::TestLog> log,
2567 std::shared_ptr<Prerequisites> prerequisites, add_ref<uint32_t> logFailureCount,
2568 const OPType reason, const tcu::UVec4 *cmp) override
2569 {
2570 DE_UNREF(unusedPrimitiveID);
2571 DE_UNREF(log);
2572 DE_UNREF(logFailureCount);
2573 DE_UNREF(reason);
2574 DE_UNREF(cmp);
2575 auto pre = static_pointer_cast<ComputePrerequisites>(prerequisites);
2576 const uint32_t subgroupCount = activeMask.subgroupCount();
2577 const uint32_t subgroupSize = pre->subgroupSize;
2578
2579 std::fill_n(pre->ballots.begin(), subgroupCount, std::pair<bool, tcu::UVec4>());
2580
2581 for (uint32_t id = 0; id < invocationStride; ++id)
2582 {
2583 if (activeMask.test((Ballots::findBit(id, subgroupSize))))
2584 {
2585 if (countOnly)
2586 {
2587 outLoc[id]++;
2588 }
2589 else
2590 {
2591 if (ops[opsIndex].caseValue)
2592 {
2593 // Emit a magic value to indicate that we shouldn't validate this ballot
2594 ref[(outLoc[id]++) * invocationStride + id] =
2595 bitsetToBallot(0x12345678, subgroupCount, subgroupSize, id);
2596 }
2597 else
2598 {
2599 add_ref<std::pair<bool, tcu::UVec4>> info(pre->ballots.at(id / subgroupSize));
2600 if (false == info.first)
2601 {
2602 info.first = true;
2603 info.second = bitsetToBallot(activeMask, pre->subgroupSizeMask, subgroupSize, id);
2604 }
2605 ref[(outLoc[id]++) * invocationStride + id] = info.second;
2606 }
2607 }
2608 }
2609 }
2610 }
2611
2612 virtual void simulateStore(const bool countOnly, add_cref<Ballots> activeMask, const uint32_t unusedPrimitiveID,
2613 const uint64_t storeValue, add_ref<std::vector<uint32_t>> outLoc,
2614 add_ref<std::vector<tcu::UVec4>> ref, add_ref<tcu::TestLog> log,
2615 std::shared_ptr<Prerequisites> prerequisites, add_ref<uint32_t> logFailureCount,
2616 const OPType reason, const tcu::UVec4 *cmp) override
2617 {
2618 DE_UNREF(unusedPrimitiveID);
2619 DE_UNREF(log);
2620 DE_UNREF(logFailureCount);
2621 DE_UNREF(reason);
2622 DE_UNREF(cmp);
2623 const uint32_t subgroupSize = static_pointer_cast<ComputePrerequisites>(prerequisites)->subgroupSize;
2624 for (uint32_t id = 0; id < invocationStride; ++id)
2625 {
2626 if (activeMask.test(Ballots::findBit(id, subgroupSize)))
2627 {
2628 if (countOnly)
2629 outLoc[id]++;
2630 else
2631 ref[(outLoc[id]++) * invocationStride + id][0] = uint32_t(storeValue & 0xFFFFFFFF);
2632 }
2633 }
2634 }
2635
2636 virtual std::shared_ptr<Prerequisites> makePrerequisites(add_cref<std::vector<uint32_t>> outputP,
2637 const uint32_t subgroupSize, const uint32_t fragmentStride,
2638 const uint32_t primitiveStride,
2639 add_ref<std::vector<SubgroupState2>> stateStack,
2640 add_ref<std::vector<uint32_t>> outLoc,
2641 add_ref<uint32_t> subgroupCount) override
2642 {
2643 DE_UNREF(outputP);
2644 DE_UNREF(fragmentStride);
2645 DE_ASSERT(invocationStride == primitiveStride);
2646 subgroupCount = ROUNDUP(invocationStride, subgroupSize) / subgroupSize;
2647 auto prerequisites = std::make_shared<ComputePrerequisites>(subgroupSize, subgroupCount);
2648 stateStack.resize(10u, SubgroupState2(subgroupCount));
2649 outLoc.resize(primitiveStride, 0u);
2650 add_ref<Ballots> activeMask(stateStack.at(0).activeMask);
2651 for (uint32_t id = 0; id < invocationStride; ++id)
2652 {
2653 activeMask.set(Ballots::findBit(id, subgroupSize));
2654 }
2655 return prerequisites;
2656 }
2657 };
2658
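// Fragment-stage specialization. Only TT_MAXIMAL is exercised here (asserted in the
// constructor); stores and ballots go through the shader-side storeValue()/storeBallot()
// helpers, which are assumed to be defined in the fragment shader source generated
// elsewhere in this file.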
2659 class FragmentRandomProgram : public RandomProgram
2660 {
2661 public:
2662 #define BALLOT_STACK_SIZE_DEFVAL_LINE (__LINE__ + 1)
2663 static constexpr const uint32_t experimentalOutLocSize = 16384;
2664 static constexpr const uint32_t conditionIfInvocationStride = 511u;
2665 FragmentRandomProgram(const CaseDef &c) : RandomProgram(c, conditionIfInvocationStride)
2666 {
2667 DE_ASSERT(caseDef.testType == TT_MAXIMAL);
2668 DE_ASSERT(c.shaderStage == VK_SHADER_STAGE_FRAGMENT_BIT);
2669 }
2670 virtual ~FragmentRandomProgram() = default;
2671
2672 static de::MovePtr<FragmentRandomProgram> create(const CaseDef &c)
2673 {
2674 return de::MovePtr<FragmentRandomProgram>(new FragmentRandomProgram(c));
2675 }
2676
2677 virtual void printIfLocalInvocationIndex(add_ref<std::stringstream> css, add_cref<FlowState> flow) override
2678 {
2679 printIndent(css);
2680 css << "if (invocationIndex() >= inputA.a[0x" << std::hex << flow.ops[flow.opsIndex].value << "]) {\n";
2681 }
2682
2683 virtual void printStore(add_ref<std::stringstream> css, add_cref<FlowState> flow) override
2684 {
2685 printIndent(css);
2686 css << "storeValue(outLoc++, 0x" << std::hex << flow.ops[flow.opsIndex].value << ");\n";
2687 }
2688
2689 virtual void printBallot(add_ref<std::stringstream> css, add_cref<FlowState>,
2690 bool endWithSemicolon = false) override
2691 {
2692 printIndent(css);
2693 // When inside loop(s), use partitionBallot rather than subgroupBallot to compute
2694 // a ballot, to make sure the ballot is "diverged enough". Don't do this for
2695 // subgroup_uniform_control_flow, since we only validate results that must be fully
2696 // reconverged.
2697 if (loopNesting > 0)
2698 {
2699 css << "storeBallot(outLoc++)";
2700 }
2701 else
2702 {
2703 css << getPartitionBallotText();
2704 }
2705 if (endWithSemicolon)
2706 {
2707 css << ";\n";
2708 }
2709 }
2710
2711 virtual std::string getPartitionBallotText() override
2712 {
2713 return "storeBallot(outLoc++)";
2714 }
2715
2716 virtual void genIf(IFType ifType, uint32_t maxLocalIndexCmp = 0u) override
2717 {
2718 DE_UNREF(maxLocalIndexCmp);
2719 RandomProgram::genIf(ifType, conditionIfInvocationStride);
2720 }
2721
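// Decodes the per-fragment layout the fragment shader writes to outputP into CPU-side
// ballots and a primitive -> fragment -> packed invocation map. Bit 31 of a packed entry
// flags a helper invocation (see fqin() and validID() below).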
2722 struct Arrangement : Prerequisites, ReconvergenceTestFragmentInstance::Arrangement
2723 {
2724 const uint32_t m_width;
2725 const uint32_t m_height;
2726 const uint32_t m_subgroupSize;
2727 const uint32_t m_fragmentStride;
2728 const uint32_t m_primitiveStride;
2729 const uint32_t m_subgroupCount;
2730 const Ballots m_initialBallots;
2731 const Ballots m_nonHelperInitialBallots;
2732 const uint32_t m_invocationStride;
2733 const std::vector<std::vector<uint32_t>> m_fragmentSubgroups;
2734 Arrangement(add_cref<std::vector<uint32_t>> info, uint32_t width, uint32_t height, uint32_t subgroupSize,
2735 uint32_t primitiveStride)
2736 : m_width(width)
2737 , m_height(height)
2738 , m_subgroupSize(subgroupSize)
2739 , m_fragmentStride(width * height)
2740 , m_primitiveStride(primitiveStride)
2741 , m_subgroupCount(calcSubgroupCount(info, primitiveStride, m_fragmentStride))
2742 , m_initialBallots(makeInitialBallots(info, primitiveStride, m_fragmentStride, false))
2743 , m_nonHelperInitialBallots(makeInitialBallots(info, primitiveStride, m_fragmentStride, true))
2744 , m_invocationStride(calcInvocationStride(info, subgroupSize, primitiveStride, m_fragmentStride))
2745 , m_fragmentSubgroups(makeFragmentSubgroups(info, subgroupSize, primitiveStride, m_fragmentStride))
2746 {
2747 }
2748 static uint32_t calcSubgroupCount(add_cref<std::vector<uint32_t>> info, const uint32_t primitiveStride,
2749 const uint32_t fragmentStride)
2750 {
2751 const uint32_t cc = fragmentStride * primitiveStride;
2752 std::set<uint32_t> s;
2753 uint32_t subgroupID;
2754 uint32_t subgroupInvocationID;
2755 uint32_t isHelperInvocation;
2756 for (uint32_t c = 0u; c < cc; ++c)
2757 {
2758 if (validID(info.at(c), subgroupID, subgroupInvocationID, isHelperInvocation))
2759 s.insert(subgroupID);
2760 }
2761 const uint32_t gMin = *s.begin();
2762 DE_UNREF(gMin);
2763 const uint32_t gMax = *std::next(s.begin(), (s.size() - 1u));
2764 DE_UNREF(gMax);
2765 DE_ASSERT(gMin == 0u);
2766 DE_ASSERT(gMax == (s.size() - 1u));
2767 return static_cast<uint32_t>(s.size());
2768 }
2769 static uint32_t calcInvocationStride(add_cref<std::vector<uint32_t>> info, const uint32_t subgroupSize,
2770 const uint32_t primitiveStride, const uint32_t fragmentStride)
2771 {
2772 return calcSubgroupCount(info, fragmentStride, primitiveStride) * subgroupSize;
2773 }
2774 static Ballots makeInitialBallots(add_cref<std::vector<uint32_t>> info, const uint32_t primitiveStride,
2775 const uint32_t fragmentStride, bool excludeHelpers)
2776 {
2777 uint32_t subgroupID;
2778 uint32_t subgroupInvocationID;
2779 uint32_t isHelperInvocation;
2780 Ballots b(calcSubgroupCount(info, fragmentStride, primitiveStride));
2781 const uint32_t cc = fragmentStride * primitiveStride;
2782 for (uint32_t c = 0u; c < cc; ++c)
2783 {
2784 if (validID(info.at(c), subgroupID, subgroupInvocationID, isHelperInvocation))
2785 {
2786 if (!(excludeHelpers && (isHelperInvocation != 0)))
2787 b.at(subgroupID).set(subgroupInvocationID);
2788 }
2789 }
2790 return b;
2791 }
2792 // Fully Qualified Invocation Name
2793 static uint32_t fqin(uint32_t maybeHelperFragmentFQIN, add_ref<uint32_t> isHelperInvocation)
2794 {
2795 isHelperInvocation = maybeHelperFragmentFQIN >> 31;
2796 return (maybeHelperFragmentFQIN & 0x7FFFFFFF);
2797 }
2798 static auto makeFragmentSubgroups(add_cref<std::vector<uint32_t>> info, const uint32_t subgroupSize,
2799 const uint32_t primitiveStride, const uint32_t fragmentStride)
2800 -> std::vector<std::vector<uint32_t>>
2801 {
2802 const uint32_t subgroupCount = calcSubgroupCount(info, fragmentStride, primitiveStride);
2803 std::vector<std::vector<uint32_t>> map(primitiveStride);
2804 for (uint32_t p = 0u; p < primitiveStride; ++p)
2805 map[p].resize(fragmentStride, (subgroupCount * subgroupSize));
2806
2807 uint32_t subgroupID;
2808 uint32_t subgroupInvocationID;
2809 uint32_t isHelperInvocation;
2810 for (uint32_t p = 0u; p < primitiveStride; ++p)
2811 for (uint32_t f = 0u; f < fragmentStride; ++f)
2812 {
2813 const uint32_t sgid = info.at(f * primitiveStride + p);
2814 if (validID(sgid, subgroupID, subgroupInvocationID, isHelperInvocation))
2815 map.at(p).at(f) =
2816 (subgroupID * subgroupSize + subgroupInvocationID) | (isHelperInvocation << 31);
2817 }
2818 return map;
2819 }
2820 static uint32_t calcRealInvocationCount(add_cref<std::vector<uint32_t>> info, uint32_t primitiveStride,
2821 uint32_t fragmentStride)
2822 {
2823 const uint32_t cc = fragmentStride * primitiveStride;
2824 uint32_t n = 0u;
2825 for (uint32_t c = 0u; c < cc; ++c)
2826 {
2827 if (info[c])
2828 ++n;
2829 }
2830 return n;
2831 }
2832
2833 private:
2834 static bool validID(const uint32_t id)
2835 {
2836 uint32_t subgroupID;
2837 DE_UNREF(subgroupID);
2838 uint32_t subgroupInvocationID;
2839 DE_UNREF(subgroupInvocationID);
2840 uint32_t isHelperInvocation;
2841 DE_UNREF(isHelperInvocation);
2842 return validID(id, subgroupID, subgroupInvocationID, isHelperInvocation);
2843 }
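// A packed entry of 0 means the fragment was not covered; otherwise bits 0..15 hold the
// subgroup invocation ID, bits 16..30 hold the subgroup ID biased by 1, and bit 31 marks
// a helper invocation.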
2844 static bool validID(const uint32_t id, add_ref<uint32_t> subgroupID, add_ref<uint32_t> subgroupInvocationID,
2845 add_ref<uint32_t> isHelperInvocation)
2846 {
2847 if (id != 0u)
2848 {
2849 subgroupInvocationID = (id & 0xFFFF);
2850 subgroupID = ((id >> 16) & 0x7FFF) - 1u;
2851 isHelperInvocation = (id >> 31);
2852 return true;
2853 }
2854 return false;
2855 }
2856 };
2857
2858 virtual uint32_t simulate(bool countOnly, uint32_t subgroupSize, add_ref<std::vector<uint64_t>> ref) override
2859 {
2860 DE_ASSERT(false); // use overloaded version of simulate() instead
2861 DE_UNREF(countOnly);
2862 DE_UNREF(subgroupSize);
2863 DE_UNREF(ref);
2864 return 0;
2865 }
2866
2867 // Simulate execution of the program. If countOnly is true, just return
2868 // the max number of outputs written. If it's false, store out the result
2869 // values to ref.
2870 virtual uint32_t execute(qpWatchDog *watchDog, bool countOnly, const uint32_t subgroupSize,
2871 const uint32_t fragmentStride, const uint32_t primitiveStride,
2872 add_ref<std::vector<tcu::UVec4>> ref, add_ref<tcu::TestLog> log,
2873 add_cref<std::vector<uint32_t>> outputP, const tcu::UVec4 *cmp = nullptr,
2874 const uint32_t reserved = (~0u)) override
2875 {
2876 DE_UNREF(reserved);
2877 uint32_t outLocs = 0u;
2878 uint32_t maxOutLocs = 0u;
2879 for (uint32_t primitiveID = 0u; primitiveID < primitiveStride; ++primitiveID)
2880 {
2881 outLocs = RandomProgram::execute(watchDog, countOnly, subgroupSize, fragmentStride, primitiveStride, ref,
2882 log, outputP, cmp, primitiveID);
2883 maxOutLocs = std::max(outLocs, maxOutLocs);
2884 }
2885 return maxOutLocs;
2886 }
2887
2888 protected:
2889 virtual void simulateStore(const bool countOnly, add_cref<Ballots> activeMask, const uint32_t primitiveID,
2890 const uint64_t storeValue, add_ref<std::vector<uint32_t>> outLoc,
2891 add_ref<std::vector<tcu::UVec4>> ref, add_ref<tcu::TestLog> log,
2892 std::shared_ptr<Prerequisites> prerequisites, add_ref<uint32_t> logFailureCount,
2893 const OPType reason, const tcu::UVec4 *cmp) override
2894 {
2895 uint32_t isHelperInvocation;
2896 add_cref<Arrangement> a(*std::static_pointer_cast<Arrangement>(prerequisites));
2897 for (const uint32_t id : a.m_fragmentSubgroups.at(primitiveID))
2898 {
2899 const uint32_t sgid = a.fqin(id, isHelperInvocation);
2900 if (sgid >= (a.m_subgroupCount * a.m_subgroupSize))
2901 continue;
2902 if (false == activeMask.test(Ballots::findBit(sgid, a.m_subgroupSize)))
2903 continue;
2904 const uint32_t loc = primitiveID * a.m_subgroupCount * 128 + sgid;
2905 const uint32_t index = ((outLoc.at(loc)++) * (a.m_primitiveStride * a.m_subgroupCount * 128) +
2906 (primitiveID * a.m_subgroupCount * 128) + sgid);
2907 if (false == countOnly)
2908 {
2909 ref.at(index) = tcu::UVec4(uint32_t(storeValue & 0xFFFFFFFF), 0u, 0u, 0u);
2910 if (cmp && logFailureCount > 0u && cmp[index] != ref.at(index))
2911 {
2912 logFailureCount -= 1u;
2913 log << tcu::TestLog::Message << logFailureCount << ": stored value mismatch from "
2914 << OPtypeToStr(reason) << tcu::TestLog::EndMessage;
2915 }
2916 }
2917 }
2918 }
2919
2920 virtual void simulateBallot(const bool countOnly, add_cref<Ballots> activeMask, const uint32_t primitiveID,
2921 const int32_t opsIndex, add_ref<std::vector<uint32_t>> outLoc,
2922 add_ref<std::vector<tcu::UVec4>> ref, add_ref<tcu::TestLog> log,
2923 std::shared_ptr<Prerequisites> prerequisites, add_ref<uint32_t> logFailureCount,
2924 const OPType reason, const tcu::UVec4 *cmp) override
2925 {
2926 DE_UNREF(opsIndex);
2927 uint32_t isHelperInvocation;
2928 add_cref<Arrangement> a(*std::static_pointer_cast<Arrangement>(prerequisites));
2929 for (const uint32_t id : a.m_fragmentSubgroups.at(primitiveID))
2930 {
2931 const uint32_t sgid = a.fqin(id, isHelperInvocation);
2932 if (sgid >= (a.m_subgroupCount * a.m_subgroupSize))
2933 continue;
2934 if (false == activeMask.test(Ballots::findBit(sgid, a.m_subgroupSize)))
2935 continue;
2936 const uint32_t loc = primitiveID * a.m_subgroupCount * 128 + sgid;
2937 const uint32_t index = ((outLoc.at(loc)++) * (a.m_primitiveStride * a.m_subgroupCount * 128) +
2938 (primitiveID * a.m_subgroupCount * 128) + sgid);
2939 if (false == countOnly)
2940 {
2941 ref.at(index) = Ballot(activeMask.at(sgid / a.m_subgroupSize));
2942 if (cmp && logFailureCount > 0u && cmp[index] != ref.at(index))
2943 {
2944 logFailureCount -= 1u;
2945 log << tcu::TestLog::Message << logFailureCount << ": ballot mismatch from " << OPtypeToStr(reason)
2946 << tcu::TestLog::EndMessage;
2947 }
2948 }
2949 }
2950 }
2951
2952 virtual std::shared_ptr<Prerequisites> makePrerequisites(add_cref<std::vector<uint32_t>> outputP,
2953 const uint32_t subgroupSize, const uint32_t fragmentStride,
2954 const uint32_t primitiveStride,
2955 add_ref<std::vector<SubgroupState2>> stateStack,
2956 add_ref<std::vector<uint32_t>> outLoc,
2957 add_ref<uint32_t> subgroupCount) override
2958 {
2959 auto prerequisites = std::make_shared<Arrangement>(outputP, fragmentStride, 1u, subgroupSize, primitiveStride);
2960 subgroupCount = prerequisites->m_subgroupCount;
2961 stateStack.resize(10u, SubgroupState2(subgroupCount));
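// The factor of 128 is presumably the maximum subgroup size Vulkan allows, used as a
// fixed per-subgroup stride so this reference layout does not depend on the actual size.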
2962 outLoc.resize((subgroupCount * 128u * fragmentStride), 0u);
2963 stateStack.at(0).activeMask = prerequisites->m_initialBallots;
2964 return prerequisites;
2965 }
2966 };
2967
2968 class VertexRandomProgram : public RandomProgram
2969 {
2970 public:
2971 static const constexpr uint32_t fillPercentage = 73u;
2972 VertexRandomProgram(add_cref<CaseDef> c)
2973 : RandomProgram(c,
2974 static_cast<uint32_t>(Arrangement::generatePrimitives(c.sizeX, c.sizeY, fillPercentage).size()))
2975 {
2976 DE_ASSERT(c.shaderStage == VK_SHADER_STAGE_VERTEX_BIT);
2977 }
2978 virtual ~VertexRandomProgram() = default;
2979
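// Vertex-stage prerequisites decoded from outputP, whose layout is: word 0 = subgroup
// count, word 1 = subgroup size, word 2 = invocation count, followed by one packed
// ((subgroupID + 1) << 16) | subgroupInvocationID entry per vertex (0 if unused).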
2980 struct Arrangement : Prerequisites
2981 {
2982 static constexpr uint32_t NUM_SUBGROUPS_OFFSET = 0u;
2983 static constexpr uint32_t SUBGROUP_SIZE_OFFSET = 1u;
2984 static constexpr uint32_t INVOCATION_COUNT_OFFSET = 2u;
2985 static constexpr uint32_t INVOCATION_ENTRIES_OFFSET = 3u;
2986
2987 const uint32_t m_subgroupSize;
2988 const uint32_t m_primitiveStride;
2989 const uint32_t m_subgroupCount;
2990 const Ballots m_initialBallots;
2991 const uint32_t m_invocationStride;
2992 const std::vector<uint32_t> m_primitiveSubgroups;
2993 Arrangement(add_cref<std::vector<uint32_t>> outputP, uint32_t subgroupSize, uint32_t primitiveStride)
2994 : m_subgroupSize(subgroupSize)
2995 , m_primitiveStride(primitiveStride)
2996 , m_subgroupCount(calcSubgroupCount(outputP))
2997 , m_initialBallots(makeInitialBallots(subgroupSize, primitiveStride, outputP))
2998 , m_invocationStride(primitiveStride)
2999 , m_primitiveSubgroups(makePrimitiveSubgroups(subgroupSize, primitiveStride, outputP))
3000 {
3001 }
3002 static uint32_t calcSubgroupCount(add_cref<std::vector<uint32_t>> outputP)
3003 {
3004 return outputP.at(NUM_SUBGROUPS_OFFSET);
3005 }
3006 static uint32_t calcSubgroupSize(add_cref<std::vector<uint32_t>> outputP)
3007 {
3008 return outputP.at(SUBGROUP_SIZE_OFFSET);
3009 }
3010 static uint32_t calcSubgroupInvocationStride(add_cref<std::vector<uint32_t>> outputP)
3011 {
3012 return outputP.at(INVOCATION_COUNT_OFFSET);
3013 }
3014 static Ballots makeInitialBallots(uint32_t subgroupSize, uint32_t primitiveStride,
3015 add_cref<std::vector<uint32_t>> outputP)
3016 {
3017 DE_UNREF(subgroupSize);
3018 const uint32_t subgroupCount = calcSubgroupCount(outputP);
3019 Ballots initialBallots(subgroupCount);
3020 for (uint32_t primitiveID = 0u; primitiveID < primitiveStride; ++primitiveID)
3021 {
3022 const uint32_t id = outputP.at(primitiveID + INVOCATION_ENTRIES_OFFSET);
3023 if (id)
3024 {
3025 const uint32_t subgroupID = (id >> 16) - 1u;
3026 const uint32_t subgroupInvocationID = id & 0xFFFF;
3027 DE_ASSERT(subgroupID < subgroupCount);
3028 DE_ASSERT(subgroupInvocationID < subgroupSize);
3029 initialBallots.at(subgroupID).set(subgroupInvocationID);
3030 }
3031 }
3032 return initialBallots;
3033 }
3034 static std::vector<uint32_t> makePrimitiveSubgroups(uint32_t subgroupSize, uint32_t primitiveStride,
3035 add_cref<std::vector<uint32_t>> outputP)
3036 {
3037 std::vector<uint32_t> map(primitiveStride);
3038 for (uint32_t primitiveID = 0u; primitiveID < primitiveStride; ++primitiveID)
3039 {
3040 const uint32_t id = outputP.at(primitiveID + INVOCATION_ENTRIES_OFFSET);
3041 if (id)
3042 {
3043 const uint32_t subgroupID = (id >> 16) - 1u;
3044 const uint32_t subgroupInvocationID = id & 0xFFFF;
3045 DE_ASSERT(subgroupInvocationID < subgroupSize);
3046 map.at(primitiveID) = subgroupID * subgroupSize + subgroupInvocationID;
3047 }
3048 }
3049 return map;
3050 }
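// Picks roughly fillPercent percent of the width x height pixel centers (without
// repetition) and returns them as point primitives in normalized device coordinates,
// using a seed derived from the framebuffer size so repeated calls yield the same set.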
3051 static std::vector<tcu::Vec4> generatePrimitives(uint32_t width, uint32_t height, uint32_t fillPercent)
3052 {
3053 deRandom rnd;
3054 std::map<uint32_t, int> map;
3055 std::vector<tcu::Vec4> points;
3056 const uint32_t frags = (width * height);
3057 const uint32_t total = (frags * fillPercent) / 100u;
3058
3059 deRandom_init(&rnd, (width * height));
3060
3061 for (uint32_t i = 0u; i < total; ++i)
3062 {
3063 const uint32_t r = deRandom_getUint32(&rnd) % frags;
3064 if (map[r] != 0)
3065 {
3066 i -= 1;
3067 continue;
3068 }
3069 map[r] = 1;
3070
3071 uint32_t y = r / width;
3072 uint32_t x = r % width;
3073 float xx = (float(x) + float(x + 1)) / (2.0f * float(width));
3074 float yy = (float(y) + float(y + 1)) / (2.0f * float(height));
3075 float xxx = xx * 2.0f - 1.0f;
3076 float yyy = yy * 2.0f - 1.0f;
3077 points.emplace_back(tcu::Vec4(xxx, yyy, 0u, 0u));
3078 }
3079 return points;
3080 }
3081 static std::vector<uint32_t> generateOutputPvector(uint32_t subgroupSize, uint32_t vertexCount)
3082 {
3083 const uint32_t subgroupCount = ROUNDUP(vertexCount, subgroupSize) / subgroupSize;
3084 std::vector<uint32_t> outputP(vertexCount + INVOCATION_ENTRIES_OFFSET);
3085 outputP.at(NUM_SUBGROUPS_OFFSET) = subgroupCount;
3086 outputP.at(SUBGROUP_SIZE_OFFSET) = subgroupSize;
3087 outputP.at(INVOCATION_COUNT_OFFSET) = vertexCount;
3088 for (uint32_t vertexID = 0u; vertexID < vertexCount; ++vertexID)
3089 {
3090 const uint32_t subgroupID = vertexID / subgroupSize;
3091 const uint32_t subgroupInvocationID = vertexID % subgroupSize;
3092 outputP.at(vertexID + INVOCATION_ENTRIES_OFFSET) = ((subgroupID + 1u) << 16) | subgroupInvocationID;
3093 }
3094 return outputP;
3095 }
3096 };
3097
3098 virtual uint32_t simulate(bool countOnly, uint32_t subgroupSize, add_ref<std::vector<uint64_t>> ref) override
3099 {
3100 DE_ASSERT(false); // use overloaded version of simulate() instead
3101 DE_UNREF(countOnly);
3102 DE_UNREF(subgroupSize);
3103 DE_UNREF(ref);
3104 return 0;
3105 }
3106
3107 protected:
3108 virtual void genIf(IFType ifType, uint32_t /*maxLocalIndexCmp*/) override
3109 {
3110 RandomProgram::genIf(ifType, RandomProgram::invocationStride);
3111 }
3112
3113 virtual std::string getPartitionBallotText() override
3114 {
3115 return "storeValue(outLoc++, subgroupBallot(true))";
3116 }
3117
3118 virtual void printIfLocalInvocationIndex(add_ref<std::stringstream> css, add_cref<FlowState> flow) override
3119 {
3120 printIndent(css);
3121 css << "if (invocationIndex() >= inputA.a[0x" << std::hex << flow.ops[flow.opsIndex].value << "]) {\n";
3122 }
3123
3124 virtual void printStore(add_ref<std::stringstream> css, add_cref<FlowState> flow) override
3125 {
3126 printIndent(css);
3127 css << "storeValue(outLoc++, 0x" << std::hex << flow.ops[flow.opsIndex].value << std::dec << ");\n";
3128 }
3129
3130 virtual void printBallot(add_ref<std::stringstream> css, add_cref<FlowState>,
3131 bool endWithSemicolon = false) override
3132 {
3133 printIndent(css);
3134 // When inside loop(s), use partitionBallot rather than subgroupBallot to compute
3135 // a ballot, to make sure the ballot is "diverged enough". Don't do this for
3136 // subgroup_uniform_control_flow, since we only validate results that must be fully
3137 // reconverged.
3138 if (loopNesting > 0 && caseDef.testType == TT_MAXIMAL)
3139 {
3140 css << getPartitionBallotText();
3141 }
3142 else
3143 {
3144 css << "storeValue(outLoc++, subgroupBallot(true))";
3145 }
3146 if (endWithSemicolon)
3147 {
3148 css << ";\n";
3149 }
3150 }
3151
3152 virtual void simulateBallot(const bool countOnly, add_cref<Ballots> activeMask, const uint32_t unusedPrimitiveID,
3153 const int32_t opsIndex, add_ref<std::vector<uint32_t>> outLoc,
3154 add_ref<std::vector<tcu::UVec4>> ref, add_ref<tcu::TestLog> log,
3155 std::shared_ptr<Prerequisites> prerequisites, add_ref<uint32_t> logFailureCount,
3156 const OPType reason, const tcu::UVec4 *cmp) override
3157 {
3158 DE_UNREF(unusedPrimitiveID);
3159 DE_UNREF(opsIndex);
3160 add_cref<Arrangement> a(*std::static_pointer_cast<Arrangement>(prerequisites));
3161 for (uint32_t primitiveID = 0u; primitiveID < a.m_primitiveStride; ++primitiveID)
3162 {
3163 const uint32_t sgid = a.m_primitiveSubgroups.at(primitiveID);
3164 DE_ASSERT(sgid < (a.m_subgroupCount * a.m_subgroupSize));
3165 if (false == activeMask.test(Ballots::findBit(sgid, a.m_subgroupSize)))
3166 continue;
3167 const uint32_t index = (outLoc.at(primitiveID)++) * a.m_invocationStride + primitiveID;
3168 if (false == countOnly)
3169 {
3170 ref.at(index) = Ballot(activeMask.at(sgid / a.m_subgroupSize));
3171 if (cmp && logFailureCount > 0u && cmp[index] != ref.at(index))
3172 {
3173 logFailureCount -= 1u;
3174 log << tcu::TestLog::Message << logFailureCount << ": ballot mismatch from "
3175 << OPtypeToStr(reason) << tcu::TestLog::EndMessage;
3176 }
3177 }
3178 }
3179 }
3180
3181 virtual void simulateStore(const bool countOnly, add_cref<Ballots> activeMask, const uint32_t unusedPrimitiveID,
3182 const uint64_t storeValue, add_ref<std::vector<uint32_t>> outLoc,
3183 add_ref<std::vector<tcu::UVec4>> ref, add_ref<tcu::TestLog> log,
3184 std::shared_ptr<Prerequisites> prerequisites, add_ref<uint32_t> logFailureCount,
3185 const OPType reason, const tcu::UVec4 *cmp) override
3186 {
3187 DE_UNREF(unusedPrimitiveID);
3188 add_cref<Arrangement> a(*std::static_pointer_cast<Arrangement>(prerequisites));
3189 for (uint32_t primitiveID = 0u; primitiveID < a.m_primitiveStride; ++primitiveID)
3190 {
3191 const uint32_t sgid = a.m_primitiveSubgroups.at(primitiveID);
3192 DE_ASSERT(sgid < (a.m_subgroupCount * a.m_subgroupSize));
3193 if (false == activeMask.test(Ballots::findBit(sgid, a.m_subgroupSize)))
3194 continue;
3195 const uint32_t index = (outLoc.at(primitiveID)++) * a.m_invocationStride + primitiveID;
3196 if (false == countOnly)
3197 {
3198 ref.at(index) = Ballot(tcu::UVec4(uint32_t(storeValue & 0xFFFFFFFF), 0u, 0u, 0u));
3199 if (cmp && logFailureCount > 0u && cmp[index] != ref.at(index))
3200 {
3201 logFailureCount -= 1u;
3202 log << tcu::TestLog::Message << logFailureCount << ": stored value mismatch from "
3203 << OPtypeToStr(reason) << tcu::TestLog::EndMessage;
3204 }
3205 }
3206 }
3207 }
3208
3209 virtual std::shared_ptr<Prerequisites> makePrerequisites(add_cref<std::vector<uint32_t>> outputP,
3210 const uint32_t subgroupSize, const uint32_t fragmentStride,
3211 const uint32_t primitiveStride,
3212 add_ref<std::vector<SubgroupState2>> stateStack,
3213 add_ref<std::vector<uint32_t>> outLoc,
3214 add_ref<uint32_t> subgroupCount) override
3215 {
3216 DE_UNREF(fragmentStride);
3217 auto prerequisites = std::make_shared<Arrangement>(outputP, subgroupSize, primitiveStride);
3218 subgroupCount = prerequisites->m_subgroupCount;
3219 stateStack.resize(10u, SubgroupState2(subgroupCount));
3220 outLoc.resize(primitiveStride, 0u);
3221 stateStack.at(0).activeMask = prerequisites->m_initialBallots;
3222 return prerequisites;
3223 }
3224 };
3225
3226 class TessCtrlRandomProgram : public RandomProgram
3227 {
3228 public:
3229     TessCtrlRandomProgram(add_cref<CaseDef> c, uint32_t invocationCount) : RandomProgram(c, invocationCount)
3230 {
3231 DE_ASSERT(c.shaderStage == VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT);
3232 }
3233 virtual ~TessCtrlRandomProgram() = default;
3234
3235 static const uint32_t minSubgroupSize = 4;
3236
3237     virtual void genIf(IFType ifType, uint32_t /*maxLocalIndexCmp*/) override
3238 {
3239 RandomProgram::genIf(ifType, std::min((minSubgroupSize * caseDef.sizeX), 64u));
3240 }
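    // The comparison constant for OP_IF_LOCAL_INVOCATION_INDEX is clamped to
    // min(minSubgroupSize * sizeX, 64); presumably minSubgroupSize * sizeX bounds the flat
    // invocation index produced with "vertices = 4" patches, while 64 keeps the value within
    // the 64-bit masks used by the CPU-side simulation.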
3241
3242     virtual void printIfLocalInvocationIndex(add_ref<std::stringstream> css, add_cref<FlowState> flow) override
3243 {
3244 printIndent(css);
3245 css << "if (";
3246 css << "((((gl_PrimitiveID * width) / gl_SubgroupSize) * gl_SubgroupSize) + gl_SubgroupInvocationID)";
3247 css << " >= inputA.a[0x" << std::hex << flow.ops[flow.opsIndex].value << "]) {\n";
3248 }
3249
3250     virtual void printStore(add_ref<std::stringstream> css, add_cref<FlowState> flow) override
3251 {
3252 printIndent(css);
3253 css << "outputC.loc[invocationIndex()]++;\n";
3254 printIndent(css);
3255 css << "outputB.b[(outLoc++) * invocationStride + invocationIndex()].x = 0x" << std::hex
3256 << flow.ops[flow.opsIndex].value << ";\n";
3257 }
3258
3259     virtual void printBallot(add_ref<std::stringstream> css, add_cref<FlowState>,
3260 bool endWithSemicolon = false) override
3261 {
3262 printIndent(css);
3263
3264 css << "outputC.loc[invocationIndex()]++,";
3265 // When inside loop(s), use partitionBallot rather than subgroupBallot to compute
3266 // a ballot, to make sure the ballot is "diverged enough". Don't do this for
3267 // subgroup_uniform_control_flow, since we only validate results that must be fully
3268 // reconverged.
3269 if (loopNesting > 0 && caseDef.testType == TT_MAXIMAL)
3270 {
3271 css << "outputB.b[(outLoc++) * invocationStride + invocationIndex()] = " << getPartitionBallotText()
3272 << ".xy";
3273 }
3274 else
3275 {
3276 css << "outputB.b[(outLoc++) * invocationStride + invocationIndex()] = subgroupBallot(true).xy";
3277 }
3278 if (endWithSemicolon)
3279 {
3280 css << ";\n";
3281 }
3282 }
3283
3284     void simulateStoreToChange(bool countOnly, uint32_t /*subgroupSize*/, const SubgroupState (&stateStack)[10],
3285 int32_t opsIndex, add_ref<std::vector<uint32_t>> outLoc,
3286 add_ref<std::vector<uint64_t>> ref)
3287 {
3288 for (uint32_t id = 0; id < invocationStride; ++id)
3289 {
3290 if (stateStack[nesting].activeMask.test(id))
3291 {
3292 if (countOnly)
3293 outLoc[id]++;
3294 else
3295 ref[(outLoc[id]++) * invocationStride + id] = ops[opsIndex].value;
3296 }
3297 }
3298 }
3299
3300     void simulateBallotToChange(bool countOnly, uint32_t subgroupSize, const SubgroupState (&stateStack)[10],
3301 uint32_t /*opsIndex*/, add_ref<std::vector<uint32_t>> outLoc,
3302 add_ref<std::vector<uint64_t>> ref)
3303 {
3304 for (uint32_t id = 0; id < invocationStride; ++id)
3305 {
3306 if (stateStack[nesting].activeMask.test(id))
3307 {
3308 if (countOnly)
3309 outLoc[id]++;
3310 else
3311 ref[(outLoc[id]++) * invocationStride + id] =
3312 bitsetToU64(stateStack[nesting].activeMask, subgroupSize, id);
3313 }
3314 }
3315 }
3316
3317 // Simulate execution of the program. If countOnly is true, just return
3318 // the max number of outputs written. If it's false, store out the result
3319 // values to ref.
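    // The test typically runs this twice: once with countOnly == true so the returned
    // maxLoc can size the output buffers (rows per invocation), and once with
    // countOnly == false to fill `ref` with the values the shader is expected to store.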
3320     virtual uint32_t simulate(bool countOnly, uint32_t subgroupSize, add_ref<std::vector<uint64_t>> ref) override
3321 {
3322 SubgroupState stateStack[10];
3323 deMemset(&stateStack, 0, sizeof(stateStack));
3324
3325 // Per-invocation output location counters
3326 std::vector<uint32_t> outLoc(invocationStride, 0u);
3327
3328 nesting = 0;
3329 loopNesting = 0;
3330
3331 for (uint32_t k = 0; k < invocationStride; ++k)
3332 stateStack[nesting].activeMask.set(k);
3333
3334 int32_t i = 0;
3335 while (i < (int32_t)ops.size())
3336 {
3337 switch (ops[i].type)
3338 {
3339 case OP_BALLOT:
3340 simulateBallotToChange(countOnly, subgroupSize, stateStack, i, outLoc, ref);
3341 break;
3342 case OP_STORE:
3343 simulateStoreToChange(countOnly, subgroupSize, stateStack, i, outLoc, ref);
3344 break;
3345 case OP_IF_MASK:
3346 nesting++;
3347 stateStack[nesting].activeMask =
3348 stateStack[nesting - 1].activeMask & bitsetFromU64(ops[i].value, subgroupSize);
3349 stateStack[nesting].header = i;
3350 stateStack[nesting].isLoop = 0;
3351 stateStack[nesting].isSwitch = 0;
3352 break;
3353 case OP_ELSE_MASK:
3354 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask &
3355 ~bitsetFromU64(ops[stateStack[nesting].header].value, subgroupSize);
3356 break;
3357 case OP_IF_LOOPCOUNT:
3358 {
3359 uint32_t n = nesting;
3360 while (!stateStack[n].isLoop)
3361 n--;
3362
3363 nesting++;
3364 stateStack[nesting].activeMask =
3365 stateStack[nesting - 1].activeMask & bitsetFromU64((1ULL << stateStack[n].tripCount), subgroupSize);
3366 stateStack[nesting].header = i;
3367 stateStack[nesting].isLoop = 0;
3368 stateStack[nesting].isSwitch = 0;
3369 break;
3370 }
3371 case OP_ELSE_LOOPCOUNT:
3372 {
3373 uint32_t n = nesting;
3374 while (!stateStack[n].isLoop)
3375 n--;
3376
3377 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask &
3378 ~bitsetFromU64((1ULL << stateStack[n].tripCount), subgroupSize);
3379 break;
3380 }
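            // OP_IF_LOCAL_INVOCATION_INDEX / OP_ELSE_LOCAL_INVOCATION_INDEX model the
            // generated "if (invocationIndex >= N)": the if-side keeps invocations with
            // id >= N, the else-side keeps id < N, so the two masks partition the parent
            // active mask.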
3381 case OP_IF_LOCAL_INVOCATION_INDEX: // TessCtrlRandomProgram
3382 {
3383 // all bits >= N
3384 bitset_inv_t mask;
3385 for (uint32_t j = static_cast<uint32_t>(ops[i].value); j < invocationStride; ++j)
3386 mask.set(j);
3387
3388 nesting++;
3389 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask & mask;
3390 stateStack[nesting].header = i;
3391 stateStack[nesting].isLoop = 0;
3392 stateStack[nesting].isSwitch = 0;
3393 break;
3394 }
3395 case OP_ELSE_LOCAL_INVOCATION_INDEX: // TessCtrlRandomProgram
3396 {
3397 // all bits < N
3398 bitset_inv_t mask;
3399 for (uint32_t j = 0; j < static_cast<uint32_t>(ops[i].value); ++j)
3400 mask.set(j);
3401
3402 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask & mask;
3403 break;
3404 }
3405 case OP_ENDIF:
3406 nesting--;
3407 break;
3408 case OP_BEGIN_FOR_UNIF:
3409 // XXX TODO: We don't handle a for loop with zero iterations
3410 nesting++;
3411 loopNesting++;
3412 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
3413 stateStack[nesting].header = i;
3414 stateStack[nesting].tripCount = 0;
3415 stateStack[nesting].isLoop = 1;
3416 stateStack[nesting].isSwitch = 0;
3417 stateStack[nesting].continueMask = 0;
3418 break;
3419 case OP_END_FOR_UNIF:
3420 stateStack[nesting].tripCount++;
3421 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
3422 stateStack[nesting].continueMask = 0;
3423 if (stateStack[nesting].tripCount < ops[stateStack[nesting].header].value &&
3424 stateStack[nesting].activeMask.any())
3425 {
3426 i = stateStack[nesting].header + 1;
3427 continue;
3428 }
3429 else
3430 {
3431 loopNesting--;
3432 nesting--;
3433 }
3434 break;
3435 case OP_BEGIN_DO_WHILE_UNIF:
3436 // XXX TODO: We don't handle a for loop with zero iterations
3437 nesting++;
3438 loopNesting++;
3439 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
3440 stateStack[nesting].header = i;
3441 stateStack[nesting].tripCount = 1;
3442 stateStack[nesting].isLoop = 1;
3443 stateStack[nesting].isSwitch = 0;
3444 stateStack[nesting].continueMask = 0;
3445 break;
3446 case OP_END_DO_WHILE_UNIF:
3447 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
3448 stateStack[nesting].continueMask = 0;
3449 if (stateStack[nesting].tripCount < ops[stateStack[nesting].header].value &&
3450 stateStack[nesting].activeMask.any())
3451 {
3452 i = stateStack[nesting].header + 1;
3453 stateStack[nesting].tripCount++;
3454 continue;
3455 }
3456 else
3457 {
3458 loopNesting--;
3459 nesting--;
3460 }
3461 break;
3462 case OP_BEGIN_FOR_VAR:
3463 // XXX TODO: We don't handle a for loop with zero iterations
3464 nesting++;
3465 loopNesting++;
3466 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
3467 stateStack[nesting].header = i;
3468 stateStack[nesting].tripCount = 0;
3469 stateStack[nesting].isLoop = 1;
3470 stateStack[nesting].isSwitch = 0;
3471 stateStack[nesting].continueMask = 0;
3472 break;
3473 case OP_END_FOR_VAR:
3474 stateStack[nesting].tripCount++;
3475 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
3476 stateStack[nesting].continueMask = 0;
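                // After N completed trips of the variable-count loop only invocations whose
                // subgroup-relative index is >= N stay active: ~((1ULL << tripCount) - 1) has
                // exactly those bits set, and once tripCount reaches subgroupSize the mask
                // collapses to zero and the loop exits.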
3477 stateStack[nesting].activeMask &= bitsetFromU64(stateStack[nesting].tripCount == subgroupSize ?
3478 0 :
3479 ~((1ULL << (stateStack[nesting].tripCount)) - 1),
3480 subgroupSize);
3481 if (stateStack[nesting].activeMask.any())
3482 {
3483 i = stateStack[nesting].header + 1;
3484 continue;
3485 }
3486 else
3487 {
3488 loopNesting--;
3489 nesting--;
3490 }
3491 break;
3492 case OP_BEGIN_FOR_INF:
3493 case OP_BEGIN_DO_WHILE_INF:
3494 nesting++;
3495 loopNesting++;
3496 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
3497 stateStack[nesting].header = i;
3498 stateStack[nesting].tripCount = 0;
3499 stateStack[nesting].isLoop = 1;
3500 stateStack[nesting].isSwitch = 0;
3501 stateStack[nesting].continueMask = 0;
3502 break;
3503 case OP_END_FOR_INF:
3504 stateStack[nesting].tripCount++;
3505 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
3506 stateStack[nesting].continueMask = 0;
3507 if (stateStack[nesting].activeMask.any())
3508 {
3509 // output expected OP_BALLOT values
3510 simulateBallotToChange(countOnly, subgroupSize, stateStack, i, outLoc, ref);
3511
3512 i = stateStack[nesting].header + 1;
3513 continue;
3514 }
3515 else
3516 {
3517 loopNesting--;
3518 nesting--;
3519 }
3520 break;
3521 case OP_END_DO_WHILE_INF:
3522 stateStack[nesting].tripCount++;
3523 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
3524 stateStack[nesting].continueMask = 0;
3525 if (stateStack[nesting].activeMask.any())
3526 {
3527 i = stateStack[nesting].header + 1;
3528 continue;
3529 }
3530 else
3531 {
3532 loopNesting--;
3533 nesting--;
3534 }
3535 break;
3536 case OP_BREAK:
3537 {
3538 uint32_t n = nesting;
3539 bitset_inv_t mask = stateStack[nesting].activeMask;
3540 while (true)
3541 {
3542 stateStack[n].activeMask &= ~mask;
3543 if (stateStack[n].isLoop || stateStack[n].isSwitch)
3544 break;
3545
3546 n--;
3547 }
3548 }
3549 break;
3550 case OP_CONTINUE:
3551 {
3552 uint32_t n = nesting;
3553 bitset_inv_t mask = stateStack[nesting].activeMask;
3554 while (true)
3555 {
3556 stateStack[n].activeMask &= ~mask;
3557 if (stateStack[n].isLoop)
3558 {
3559 stateStack[n].continueMask |= mask;
3560 break;
3561 }
3562 n--;
3563 }
3564 }
3565 break;
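            // OP_ELECT keeps a single invocation per subgroup; the simulation assumes the
            // lowest active bit wins, mirroring the one invocation for which subgroupElect()
            // returns true in the generated shader.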
3566 case OP_ELECT:
3567 {
3568 nesting++;
3569 stateStack[nesting].activeMask = bitsetElect(stateStack[nesting - 1].activeMask, subgroupSize);
3570 stateStack[nesting].header = i;
3571 stateStack[nesting].isLoop = 0;
3572 stateStack[nesting].isSwitch = 0;
3573 }
3574 break;
3575 case OP_RETURN:
3576 {
3577 bitset_inv_t mask = stateStack[nesting].activeMask;
3578 for (int32_t n = nesting; n >= 0; --n)
3579 {
3580 stateStack[n].activeMask &= ~mask;
3581 if (stateStack[n].isCall)
3582 break;
3583 }
3584 }
3585 break;
3586
3587 case OP_CALL_BEGIN:
3588 nesting++;
3589 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
3590 stateStack[nesting].isLoop = 0;
3591 stateStack[nesting].isSwitch = 0;
3592 stateStack[nesting].isCall = 1;
3593 break;
3594 case OP_CALL_END:
3595 stateStack[nesting].isCall = 0;
3596 nesting--;
3597 break;
3598 case OP_NOISE:
3599 break;
3600
3601 case OP_SWITCH_UNIF_BEGIN:
3602 case OP_SWITCH_VAR_BEGIN:
3603 case OP_SWITCH_LOOP_COUNT_BEGIN:
3604 nesting++;
3605 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
3606 stateStack[nesting].header = i;
3607 stateStack[nesting].isLoop = 0;
3608 stateStack[nesting].isSwitch = 1;
3609 break;
3610 case OP_SWITCH_END:
3611 nesting--;
3612 break;
3613 case OP_CASE_MASK_BEGIN:
3614 stateStack[nesting].activeMask =
3615 stateStack[nesting - 1].activeMask & bitsetFromU64(ops[i].value, subgroupSize);
3616 break;
3617 case OP_CASE_LOOP_COUNT_BEGIN:
3618 {
3619 uint32_t n = nesting;
3620 uint32_t l = loopNesting;
3621
3622 while (true)
3623 {
3624 if (stateStack[n].isLoop)
3625 {
3626 l--;
3627 if (l == ops[stateStack[nesting].header].value)
3628 break;
3629 }
3630 n--;
3631 }
3632
3633 if ((1ULL << stateStack[n].tripCount) & ops[i].value)
3634 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
3635 else
3636 stateStack[nesting].activeMask = 0;
3637 break;
3638 }
3639 case OP_CASE_END:
3640 break;
3641
3642 default:
3643 DE_ASSERT(0);
3644 break;
3645 }
3646 i++;
3647 }
3648 uint32_t maxLoc = 0;
3649 for (uint32_t id = 0; id < (uint32_t)outLoc.size(); ++id)
3650 maxLoc = de::max(maxLoc, outLoc[id]);
3651
3652 return maxLoc;
3653 }
3654 };
3655
3656 class TessEvalRandomProgram : public RandomProgram
3657 {
3658 public:
3659     TessEvalRandomProgram(add_cref<CaseDef> c, uint32_t invocationCount = 0)
3660 : RandomProgram(c, (invocationCount ? invocationCount : 64))
3661 , ifLocalInvocationIndexAsSubgroupInvocationID(false)
3662 {
3663 DE_ASSERT(c.shaderStage == VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT);
3664 }
3665 virtual ~TessEvalRandomProgram() = default;
3666
3667 const bool ifLocalInvocationIndexAsSubgroupInvocationID;
3668 static const uint32_t quadInvocationCount = 4;
3669
3670 // Simulate execution of the program. If countOnly is true, just return
3671 // the max number of outputs written. If it's false, store out the result
3672 // values to ref.
3673     virtual uint32_t simulate(bool countOnly, uint32_t subgroupSize, add_ref<std::vector<uint64_t>> ref) override
3674 {
3675 SubgroupState stateStack[10];
3676 deMemset(&stateStack, 0, sizeof(stateStack));
3677
3678 // Per-invocation output location counters
3679 std::vector<uint32_t> outLoc(invocationStride, 0u);
3680
3681 nesting = 0;
3682 loopNesting = 0;
3683
3684 for (uint32_t k = 0; k < invocationStride; ++k)
3685 stateStack[nesting].activeMask.set(k);
3686
3687 int32_t i = 0;
3688 while (i < (int32_t)ops.size())
3689 {
3690 switch (ops[i].type)
3691 {
3692 case OP_BALLOT:
3693 simulateBallotToChange(countOnly, subgroupSize, stateStack, i, outLoc, ref);
3694 break;
3695 case OP_STORE:
3696 simulateStoreToChange(countOnly, subgroupSize, stateStack, i, outLoc, ref);
3697 break;
3698 case OP_IF_MASK:
3699 nesting++;
3700 stateStack[nesting].activeMask =
3701 stateStack[nesting - 1].activeMask & bitsetFromU64(ops[i].value, subgroupSize);
3702 stateStack[nesting].header = i;
3703 stateStack[nesting].isLoop = 0;
3704 stateStack[nesting].isSwitch = 0;
3705 break;
3706 case OP_ELSE_MASK:
3707 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask &
3708 ~bitsetFromU64(ops[stateStack[nesting].header].value, subgroupSize);
3709 break;
3710 case OP_IF_LOOPCOUNT:
3711 {
3712 uint32_t n = nesting;
3713 while (!stateStack[n].isLoop)
3714 n--;
3715
3716 nesting++;
3717 stateStack[nesting].activeMask =
3718 stateStack[nesting - 1].activeMask & bitsetFromU64((1ULL << stateStack[n].tripCount), subgroupSize);
3719 stateStack[nesting].header = i;
3720 stateStack[nesting].isLoop = 0;
3721 stateStack[nesting].isSwitch = 0;
3722 break;
3723 }
3724 case OP_ELSE_LOOPCOUNT:
3725 {
3726 uint32_t n = nesting;
3727 while (!stateStack[n].isLoop)
3728 n--;
3729
3730 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask &
3731 ~bitsetFromU64((1ULL << stateStack[n].tripCount), subgroupSize);
3732 break;
3733 }
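            // TessEval supports two addressing modes: with
            // ifLocalInvocationIndexAsSubgroupInvocationID the generated "if" compares
            // gl_SubgroupInvocationID (the per-subgroup mask is then expanded over all
            // invocations by bitsetFromU64), otherwise it compares the flat invocation index
            // and the mask simply covers all invocations >= N.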
3734 case OP_IF_LOCAL_INVOCATION_INDEX: // TessEvalRandomProgram
3735 {
3736 bitset_inv_t mask;
3737 if (ifLocalInvocationIndexAsSubgroupInvocationID)
3738 {
3739 // if (gl_SubgroupInvocationID >= value), all bits >= N
3740 for (uint32_t j = static_cast<uint32_t>(ops[i].value); j < subgroupSize; ++j)
3741 mask.set(j);
3742 mask = bitsetFromU64(mask.to_ullong(), subgroupSize);
3743 }
3744 else
3745 {
3746 // all bits >= N
3747 for (uint32_t j = (uint32_t)ops[i].value; j < invocationStride; ++j)
3748 mask.set(j);
3749 }
3750
3751 nesting++;
3752 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask & mask;
3753 stateStack[nesting].header = i;
3754 stateStack[nesting].isLoop = 0;
3755 stateStack[nesting].isSwitch = 0;
3756 break;
3757 }
3758 case OP_ELSE_LOCAL_INVOCATION_INDEX: // TessEvalRandomProgram
3759 {
3760 // all bits < N
3761 bitset_inv_t mask;
3762 for (uint32_t j = 0; j < static_cast<uint32_t>(ops[i].value); ++j)
3763 mask.set(j);
3764
3765 if (ifLocalInvocationIndexAsSubgroupInvocationID)
3766 {
3767 // else (gl_SubgroupInvocationID >= value), all bits < N
3768 mask = bitsetFromU64(mask.to_ullong(), subgroupSize);
3769 }
3770
3771 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask & mask;
3772 break;
3773 }
3774 case OP_ENDIF:
3775 nesting--;
3776 break;
3777 case OP_BEGIN_FOR_UNIF:
3778 // XXX TODO: We don't handle a for loop with zero iterations
3779 nesting++;
3780 loopNesting++;
3781 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
3782 stateStack[nesting].header = i;
3783 stateStack[nesting].tripCount = 0;
3784 stateStack[nesting].isLoop = 1;
3785 stateStack[nesting].isSwitch = 0;
3786 stateStack[nesting].continueMask = 0;
3787 break;
3788 case OP_END_FOR_UNIF:
3789 stateStack[nesting].tripCount++;
3790 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
3791 stateStack[nesting].continueMask = 0;
3792 if (stateStack[nesting].tripCount < ops[stateStack[nesting].header].value &&
3793 stateStack[nesting].activeMask.any())
3794 {
3795 i = stateStack[nesting].header + 1;
3796 continue;
3797 }
3798 else
3799 {
3800 loopNesting--;
3801 nesting--;
3802 }
3803 break;
3804 case OP_BEGIN_DO_WHILE_UNIF:
3805 // XXX TODO: We don't handle a for loop with zero iterations
3806 nesting++;
3807 loopNesting++;
3808 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
3809 stateStack[nesting].header = i;
3810 stateStack[nesting].tripCount = 1;
3811 stateStack[nesting].isLoop = 1;
3812 stateStack[nesting].isSwitch = 0;
3813 stateStack[nesting].continueMask = 0;
3814 break;
3815 case OP_END_DO_WHILE_UNIF:
3816 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
3817 stateStack[nesting].continueMask = 0;
3818 if (stateStack[nesting].tripCount < ops[stateStack[nesting].header].value &&
3819 stateStack[nesting].activeMask.any())
3820 {
3821 i = stateStack[nesting].header + 1;
3822 stateStack[nesting].tripCount++;
3823 continue;
3824 }
3825 else
3826 {
3827 loopNesting--;
3828 nesting--;
3829 }
3830 break;
3831 case OP_BEGIN_FOR_VAR:
3832 // XXX TODO: We don't handle a for loop with zero iterations
3833 nesting++;
3834 loopNesting++;
3835 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
3836 stateStack[nesting].header = i;
3837 stateStack[nesting].tripCount = 0;
3838 stateStack[nesting].isLoop = 1;
3839 stateStack[nesting].isSwitch = 0;
3840 stateStack[nesting].continueMask = 0;
3841 break;
3842 case OP_END_FOR_VAR:
3843 stateStack[nesting].tripCount++;
3844 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
3845 stateStack[nesting].continueMask = 0;
3846 stateStack[nesting].activeMask &= bitsetFromU64(stateStack[nesting].tripCount == subgroupSize ?
3847 0 :
3848 ~((1ULL << (stateStack[nesting].tripCount)) - 1),
3849 subgroupSize);
3850 if (stateStack[nesting].activeMask.any())
3851 {
3852 i = stateStack[nesting].header + 1;
3853 continue;
3854 }
3855 else
3856 {
3857 loopNesting--;
3858 nesting--;
3859 }
3860 break;
3861 case OP_BEGIN_FOR_INF:
3862 case OP_BEGIN_DO_WHILE_INF:
3863 nesting++;
3864 loopNesting++;
3865 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
3866 stateStack[nesting].header = i;
3867 stateStack[nesting].tripCount = 0;
3868 stateStack[nesting].isLoop = 1;
3869 stateStack[nesting].isSwitch = 0;
3870 stateStack[nesting].continueMask = 0;
3871 break;
3872 case OP_END_FOR_INF:
3873 stateStack[nesting].tripCount++;
3874 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
3875 stateStack[nesting].continueMask = 0;
3876 if (stateStack[nesting].activeMask.any())
3877 {
3878 // output expected OP_BALLOT values
3879 simulateBallotToChange(countOnly, subgroupSize, stateStack, i, outLoc, ref);
3880
3881 i = stateStack[nesting].header + 1;
3882 continue;
3883 }
3884 else
3885 {
3886 loopNesting--;
3887 nesting--;
3888 }
3889 break;
3890 case OP_END_DO_WHILE_INF:
3891 stateStack[nesting].tripCount++;
3892 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
3893 stateStack[nesting].continueMask = 0;
3894 if (stateStack[nesting].activeMask.any())
3895 {
3896 i = stateStack[nesting].header + 1;
3897 continue;
3898 }
3899 else
3900 {
3901 loopNesting--;
3902 nesting--;
3903 }
3904 break;
3905 case OP_BREAK:
3906 {
3907 uint32_t n = nesting;
3908 bitset_inv_t mask = stateStack[nesting].activeMask;
3909 while (true)
3910 {
3911 stateStack[n].activeMask &= ~mask;
3912 if (stateStack[n].isLoop || stateStack[n].isSwitch)
3913 break;
3914
3915 n--;
3916 }
3917 }
3918 break;
3919 case OP_CONTINUE:
3920 {
3921 uint32_t n = nesting;
3922 bitset_inv_t mask = stateStack[nesting].activeMask;
3923 while (true)
3924 {
3925 stateStack[n].activeMask &= ~mask;
3926 if (stateStack[n].isLoop)
3927 {
3928 stateStack[n].continueMask |= mask;
3929 break;
3930 }
3931 n--;
3932 }
3933 }
3934 break;
3935 case OP_ELECT:
3936 {
3937 nesting++;
3938 stateStack[nesting].activeMask = bitsetElect(stateStack[nesting - 1].activeMask, subgroupSize);
3939 stateStack[nesting].header = i;
3940 stateStack[nesting].isLoop = 0;
3941 stateStack[nesting].isSwitch = 0;
3942 }
3943 break;
3944 case OP_RETURN:
3945 {
3946 bitset_inv_t mask = stateStack[nesting].activeMask;
3947 for (int32_t n = nesting; n >= 0; --n)
3948 {
3949 stateStack[n].activeMask &= ~mask;
3950 if (stateStack[n].isCall)
3951 break;
3952 }
3953 }
3954 break;
3955
3956 case OP_CALL_BEGIN:
3957 nesting++;
3958 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
3959 stateStack[nesting].isLoop = 0;
3960 stateStack[nesting].isSwitch = 0;
3961 stateStack[nesting].isCall = 1;
3962 break;
3963 case OP_CALL_END:
3964 stateStack[nesting].isCall = 0;
3965 nesting--;
3966 break;
3967 case OP_NOISE:
3968 break;
3969
3970 case OP_SWITCH_UNIF_BEGIN:
3971 case OP_SWITCH_VAR_BEGIN:
3972 case OP_SWITCH_LOOP_COUNT_BEGIN:
3973 nesting++;
3974 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
3975 stateStack[nesting].header = i;
3976 stateStack[nesting].isLoop = 0;
3977 stateStack[nesting].isSwitch = 1;
3978 break;
3979 case OP_SWITCH_END:
3980 nesting--;
3981 break;
3982 case OP_CASE_MASK_BEGIN:
3983 stateStack[nesting].activeMask =
3984 stateStack[nesting - 1].activeMask & bitsetFromU64(ops[i].value, subgroupSize);
3985 break;
3986 case OP_CASE_LOOP_COUNT_BEGIN:
3987 {
3988 uint32_t n = nesting;
3989 uint32_t l = loopNesting;
3990
3991 while (true)
3992 {
3993 if (stateStack[n].isLoop)
3994 {
3995 l--;
3996 if (l == ops[stateStack[nesting].header].value)
3997 break;
3998 }
3999 n--;
4000 }
4001
4002 if ((1ULL << stateStack[n].tripCount) & ops[i].value)
4003 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
4004 else
4005 stateStack[nesting].activeMask = 0;
4006 break;
4007 }
4008 case OP_CASE_END:
4009 break;
4010
4011 default:
4012 DE_ASSERT(0);
4013 break;
4014 }
4015 i++;
4016 }
4017 uint32_t maxLoc = 0;
4018 for (uint32_t id = 0; id < (uint32_t)outLoc.size(); ++id)
4019 maxLoc = de::max(maxLoc, outLoc[id]);
4020
4021 return maxLoc;
4022 }
4023
4024 protected:
4025     virtual void genIf(IFType ifType, uint32_t /*maxLocalIndexCmp*/) override
4026 {
4027 RandomProgram::genIf(ifType, std::min(64u, (caseDef.sizeX * quadInvocationCount - 1)));
4028 }
4029
4030     virtual void printIfLocalInvocationIndex(add_ref<std::stringstream> css, add_cref<FlowState> flow) override
4031 {
4032 // uint invocationIndex() { return gl_PrimitiveID * width + gl_SubgroupInvocationID; }
4033 printIndent(css);
4034 css << "if (";
4035 if (ifLocalInvocationIndexAsSubgroupInvocationID)
4036 css << "gl_SubgroupInvocationID";
4037 else
4038 css << "((((gl_PrimitiveID * width) / gl_SubgroupSize) * gl_SubgroupSize) + gl_SubgroupInvocationID)";
4039 css << " >= inputA.a[0x" << std::hex << flow.ops[flow.opsIndex].value << "]) {\n";
4040 }
4041
4042     virtual void printStore(add_ref<std::stringstream> css, add_cref<FlowState> flow) override
4043 {
4044 printIndent(css);
4045 css << "outputC.loc[invocationIndex()]++;\n";
4046 printIndent(css);
4047 css << "outputB.b[(outLoc++)*invocationStride + invocationIndex()].x = 0x" << std::hex
4048 << flow.ops[flow.opsIndex].value << ";\n";
4049 }
4050
4051     virtual void printBallot(add_ref<std::stringstream> css, add_cref<FlowState>,
4052 bool endWithSemicolon = false) override
4053 {
4054 printIndent(css);
4055
4056 css << "outputC.loc[invocationIndex()]++,";
4057 // When inside loop(s), use partitionBallot rather than subgroupBallot to compute
4058 // a ballot, to make sure the ballot is "diverged enough". Don't do this for
4059 // subgroup_uniform_control_flow, since we only validate results that must be fully
4060 // reconverged.
4061 if (loopNesting > 0 && caseDef.testType == TT_MAXIMAL)
4062 {
4063 css << "outputB.b[(outLoc++)*invocationStride + invocationIndex()] = " << getPartitionBallotText() << ".xy";
4064 }
4065 else
4066 {
4067 css << "outputB.b[(outLoc++)*invocationStride + invocationIndex()] = subgroupBallot(true).xy";
4068 }
4069 if (endWithSemicolon)
4070 {
4071 css << ";\n";
4072 }
4073 }
4074
4075     void simulateStoreToChange(bool countOnly, uint32_t /*subgroupSize*/, const SubgroupState (&stateStack)[10],
4076 int32_t opsIndex, add_ref<std::vector<uint32_t>> outLoc,
4077 add_ref<std::vector<uint64_t>> ref)
4078 {
4079 for (uint32_t id = 0; id < invocationStride; ++id)
4080 {
4081 if (stateStack[nesting].activeMask.test(id))
4082 {
4083 if (countOnly)
4084 outLoc[id]++;
4085 else
4086 ref[(outLoc[id]++) * invocationStride + id] = ops[opsIndex].value;
4087 }
4088 }
4089 }
4090
4091     void simulateBallotToChange(bool countOnly, uint32_t subgroupSize, const SubgroupState (&stateStack)[10],
4092 uint32_t /*opsIndex*/, add_ref<std::vector<uint32_t>> outLoc,
4093 add_ref<std::vector<uint64_t>> ref)
4094 {
4095 for (uint32_t id = 0; id < invocationStride; ++id)
4096 {
4097 if (stateStack[nesting].activeMask.test(id))
4098 {
4099 if (countOnly)
4100 outLoc[id]++;
4101 else
4102 ref[(outLoc[id]++) * invocationStride + id] =
4103 bitsetToU64(stateStack[nesting].activeMask, subgroupSize, id);
4104 }
4105 }
4106 }
4107 };
4108
4109 class GeometryRandomProgram : public RandomProgram
4110 {
4111 public:
4112 static const constexpr uint32_t fillPercentage = 71u;
4113     GeometryRandomProgram(add_cref<CaseDef> c)
4114 : RandomProgram(c, Arrangement::calculatePrimitiveCount(c.sizeX, c.sizeY, fillPercentage))
4115 {
4116 DE_ASSERT(c.shaderStage == VK_SHADER_STAGE_GEOMETRY_BIT);
4117 }
4118 virtual ~GeometryRandomProgram() = default;
4119
4120 struct Arrangement : Prerequisites
4121 {
4122 static constexpr uint32_t NUM_SUBGROUPS_OFFSET = 0u;
4123 static constexpr uint32_t SUBGROUP_SIZE_OFFSET = 1u;
4124 static constexpr uint32_t INVOCATION_COUNT_OFFSET = 2u;
4125 static constexpr uint32_t MAX_LOC_OFFSET = 3u;
4126 static constexpr uint32_t MAX_IDENTITY_OFFSET = 4u;
4127 static constexpr uint32_t INVOCATION_ENTRY_OFFSET = 5u;
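        // Layout of the outputP buffer written by the geometry shader: a five-word header
        // (subgroup count, subgroup size, invocation count, max output location, max
        // identity) followed by one word per primitive encoding
        // ((subgroupID + 1) << 16) | subgroupInvocationID, where zero means the primitive
        // never reported an identity.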
4128
4129 const uint32_t m_shaderSubgroupSize;
4130 const uint32_t m_shaderSubgroupCount;
4131 const uint32_t m_shaderInvocationCount;
4132 const uint32_t m_shaderMaxLoc;
4133 const uint32_t m_shaderMaxIdentity;
4134
4135 const uint32_t m_subgroupSize;
4136 const uint32_t m_primitiveStride;
4137 const uint32_t m_invocationStride;
4138 const uint32_t m_subgroupCount;
4139 const Ballots m_initialBallots;
4140 const std::vector<uint32_t> m_primitiveSubgroups;
4141
4142         Arrangement(add_cref<std::vector<uint32_t>> outputP, uint32_t subgroupSize, uint32_t primitiveStride)
4143 : m_shaderSubgroupSize(outputP.at(SUBGROUP_SIZE_OFFSET))
4144 , m_shaderSubgroupCount(outputP.at(NUM_SUBGROUPS_OFFSET))
4145 , m_shaderInvocationCount(outputP.at(INVOCATION_COUNT_OFFSET))
4146 , m_shaderMaxLoc(outputP.at(MAX_LOC_OFFSET))
4147 , m_shaderMaxIdentity(outputP.at(MAX_IDENTITY_OFFSET))
4148 , m_subgroupSize(subgroupSize)
4149 , m_primitiveStride(primitiveStride)
4150 , m_invocationStride(primitiveStride)
4151 , m_subgroupCount(ROUNDUP(primitiveStride, subgroupSize) / subgroupSize)
4152 , m_initialBallots(makeInitialBallots(outputP))
4153 , m_primitiveSubgroups(makePrimitiveSubgroups(outputP))
4154 {
4155 }
4156         static Ballots makeInitialBallots(add_cref<std::vector<uint32_t>> outputP)
4157 {
4158 const uint32_t subgroupCount = outputP.at(NUM_SUBGROUPS_OFFSET);
4159 const uint32_t subgroupSize = outputP.at(SUBGROUP_SIZE_OFFSET);
4160 DE_UNREF(subgroupSize);
4161 const uint32_t primitiveStride = outputP.at(INVOCATION_COUNT_OFFSET);
4162 Ballots b(subgroupCount);
4163 for (uint32_t primitiveID = 0u; primitiveID < primitiveStride; ++primitiveID)
4164 {
4165 const uint32_t id = outputP.at(primitiveID + INVOCATION_ENTRY_OFFSET);
4166 if (id)
4167 {
4168 const uint32_t subgroupID = (id >> 16) - 1u;
4169 const uint32_t subgroupInvocationID = id & 0xFFFF;
4170 DE_ASSERT(subgroupID < subgroupCount);
4171 DE_ASSERT(subgroupInvocationID < subgroupSize);
4172 b.at(subgroupID).set(subgroupInvocationID);
4173 }
4174 }
4175 return b;
4176 }
4177         static std::vector<uint32_t> makePrimitiveSubgroups(add_cref<std::vector<uint32_t>> outputP)
4178 {
4179 const uint32_t subgroupSize = outputP.at(SUBGROUP_SIZE_OFFSET);
4180 const uint32_t primitiveStride = outputP.at(INVOCATION_COUNT_OFFSET);
4181 std::vector<uint32_t> map(primitiveStride);
4182 for (uint32_t primitiveID = 0u; primitiveID < primitiveStride; ++primitiveID)
4183 {
4184 const uint32_t id = outputP.at(primitiveID + INVOCATION_ENTRY_OFFSET);
4185 if (id)
4186 {
4187 const uint32_t subgroupID = (id >> 16) - 1u;
4188 const uint32_t subgroupInvocationID = id & 0xFFFF;
4189 DE_ASSERT(subgroupInvocationID < subgroupSize);
4190 map.at(primitiveID) = subgroupID * subgroupSize + subgroupInvocationID;
4191 }
4192 }
4193 return map;
4194 }
4195         static uint32_t calculatePrimitiveCount(uint32_t width, uint32_t height, uint32_t fillPercent)
4196 {
4197 deRandom rnd;
4198 std::map<uint32_t, int> map;
4199 std::vector<tcu::Vec4> points;
4200 const uint32_t frags = (width * height);
4201 const uint32_t total = (frags * fillPercent) / 100u;
4202
4203 deRandom_init(&rnd, (width * height));
4204
4205 for (uint32_t i = 0u; i < total; ++i)
4206 {
4207 const uint32_t r = deRandom_getUint32(&rnd) % frags;
4208 if (map[r] != 0)
4209 {
4210 i -= 1;
4211 continue;
4212 }
4213 map[r] = 1;
4214 }
4215
4216 return static_cast<uint32_t>(map.size());
4217 }
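        // calculatePrimitiveCount() and generatePrimitives() seed deRandom with the same
        // value (width * height) and use the same duplicate-rejection loop, so they always
        // agree on which fragment cells receive a point and hence on the primitive count.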
4218         static std::vector<tcu::Vec4> generatePrimitives(uint32_t width, uint32_t height, uint32_t fillPercent)
4219 {
4220 deRandom rnd;
4221 std::map<uint32_t, int> map;
4222 std::vector<tcu::Vec4> points;
4223 const uint32_t frags = (width * height);
4224 const uint32_t total = (frags * fillPercent) / 100u;
4225
4226 deRandom_init(&rnd, (width * height));
4227
4228 for (uint32_t i = 0u; i < total; ++i)
4229 {
4230 const uint32_t r = deRandom_getUint32(&rnd) % frags;
4231 if (map[r] != 0)
4232 {
4233 i -= 1;
4234 continue;
4235 }
4236 map[r] = 1;
4237
4238 uint32_t y = r / width;
4239 uint32_t x = r % width;
4240 float xx = (float(x) + float(x + 1)) / (2.0f * float(width));
4241 float yy = (float(y) + float(y + 1)) / (2.0f * float(height));
4242 float xxx = xx * 2.0f - 1.0f;
4243 float yyy = yy * 2.0f - 1.0f;
4244 points.emplace_back(tcu::Vec4(xxx, yyy, 0u, 0u));
4245 }
4246 return points;
4247 }
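        // Each selected cell (x, y) is emitted at its pixel centre mapped to NDC:
        // xx = (x + 0.5) / width, then xxx = 2 * xx - 1. For example, with width == 4 the
        // cell x == 1 maps to xx = 0.375 and xxx = -0.25.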
4248         static std::vector<uint32_t> generateVectorOutputP(uint32_t subgroupSize, uint32_t primitiveStride)
4249 {
4250 const uint32_t subgroupCount = ROUNDUP(primitiveStride, subgroupSize) / subgroupSize;
4251 std::vector<uint32_t> outputP(primitiveStride + INVOCATION_ENTRY_OFFSET);
4252 outputP.at(NUM_SUBGROUPS_OFFSET) = subgroupCount;
4253 outputP.at(SUBGROUP_SIZE_OFFSET) = subgroupSize;
4254 outputP.at(INVOCATION_COUNT_OFFSET) = primitiveStride;
4255 outputP.at(MAX_LOC_OFFSET) = 0u;
4256 outputP.at(MAX_IDENTITY_OFFSET) = 0u;
4257 for (uint32_t vertexID = 0u; vertexID < primitiveStride; ++vertexID)
4258 {
4259 const uint32_t subgroupID = vertexID / subgroupSize;
4260 const uint32_t subgroupInvocationID = vertexID % subgroupSize;
4261 outputP.at(vertexID + INVOCATION_ENTRY_OFFSET) = ((subgroupID + 1u) << 16) | subgroupInvocationID;
4262 }
4263 return outputP;
4264 }
4265         static std::vector<uint32_t> generateVectorOutputP(uint32_t subgroupSize, uint32_t width, uint32_t height,
4266 uint32_t percent)
4267 {
4268 const uint32_t primitiveStride = calculatePrimitiveCount(width, height, percent);
4269 return generateVectorOutputP(subgroupSize, primitiveStride);
4270 }
4271 };
4272
4273     virtual uint32_t simulate(bool countOnly, uint32_t subgroupSize, add_ref<std::vector<uint64_t>> ref) override
4274 {
4275 DE_ASSERT(false); // use overloaded version of simulate() instead
4276 DE_UNREF(countOnly);
4277 DE_UNREF(subgroupSize);
4278 DE_UNREF(ref);
4279 return 0;
4280 }
4281
4282 protected:
4283     virtual void genIf(IFType ifType, uint32_t /*maxLocalIndexCmp*/) override
4284 {
4285 RandomProgram::genIf(ifType, RandomProgram::invocationStride);
4286 }
4287
4288     virtual std::string getPartitionBallotText() override
4289 {
4290 return "storeValue(outLoc++, subgroupBallot(true))";
4291 }
4292
4293     virtual void printIfLocalInvocationIndex(add_ref<std::stringstream> css, add_cref<FlowState> flow) override
4294 {
4295 printIndent(css);
4296 css << "if (invocationIndex() >= inputA.a[0x" << std::hex << flow.ops[flow.opsIndex].value << "]) {\n";
4297 }
4298
4299     virtual void printStore(add_ref<std::stringstream> css, add_cref<FlowState> flow) override
4300 {
4301 printIndent(css);
4302 css << "storeValue(outLoc++, 0x" << std::hex << flow.ops[flow.opsIndex].value << std::dec << ");\n";
4303 }
4304
4305     virtual void printBallot(add_ref<std::stringstream> css, add_cref<FlowState>,
4306 bool endWithSemicolon = false) override
4307 {
4308 printIndent(css);
4309 // When inside loop(s), use partitionBallot rather than subgroupBallot to compute
4310 // a ballot, to make sure the ballot is "diverged enough". Don't do this for
4311 // subgroup_uniform_control_flow, since we only validate results that must be fully
4312 // reconverged.
4313 if (loopNesting > 0 && caseDef.testType == TT_MAXIMAL)
4314 {
4315 css << getPartitionBallotText();
4316 }
4317 else
4318 {
4319 css << "storeValue(outLoc++, subgroupBallot(true))";
4320 }
4321 if (endWithSemicolon)
4322 {
4323 css << ";\n";
4324 }
4325 }
4326
4327     virtual void simulateBallot(const bool countOnly, add_cref<Ballots> activeMask, const uint32_t unusedPrimitiveID,
4328 const int32_t opsIndex, add_ref<std::vector<uint32_t>> outLoc,
4329 add_ref<std::vector<tcu::UVec4>> ref, add_ref<tcu::TestLog> log,
4330 std::shared_ptr<Prerequisites> prerequisites, add_ref<uint32_t> logFailureCount,
4331 const OPType reason, const tcu::UVec4 *cmp) override
4332 {
4333 DE_UNREF(unusedPrimitiveID);
4334 DE_UNREF(opsIndex);
4335 add_cref<Arrangement> a(*std::static_pointer_cast<Arrangement>(prerequisites));
4336 for (uint32_t primitiveID = 0u; primitiveID < a.m_primitiveStride; ++primitiveID)
4337 {
4338 const uint32_t sgid = a.m_primitiveSubgroups.at(primitiveID);
4339 DE_ASSERT(sgid < (a.m_subgroupCount * a.m_subgroupSize));
4340 if (false == activeMask.test(Ballots::findBit(sgid, a.m_subgroupSize)))
4341 continue;
4342 const uint32_t index = (outLoc.at(primitiveID)++) * a.m_invocationStride + primitiveID;
4343 if (false == countOnly)
4344 {
4345 ref.at(index) = Ballot(activeMask.at(sgid / a.m_subgroupSize));
4346 if (cmp && logFailureCount > 0u && cmp[index] != ref.at(index))
4347 {
4348 logFailureCount -= 1u;
4349 log << tcu::TestLog::Message << logFailureCount << ": stored value mismatch from "
4350 << OPtypeToStr(reason) << tcu::TestLog::EndMessage;
4351 }
4352 }
4353 }
4354 }
4355
4356     virtual void simulateStore(const bool countOnly, add_cref<Ballots> activeMask, const uint32_t unusedPrimitiveID,
4357 const uint64_t storeValue, add_ref<std::vector<uint32_t>> outLoc,
4358 add_ref<std::vector<tcu::UVec4>> ref, add_ref<tcu::TestLog> log,
4359 std::shared_ptr<Prerequisites> prerequisites, add_ref<uint32_t> logFailureCount,
4360 const OPType reason, const tcu::UVec4 *cmp) override
4361 {
4362 DE_UNREF(unusedPrimitiveID);
4363 add_cref<Arrangement> a(*std::static_pointer_cast<Arrangement>(prerequisites));
4364 for (uint32_t primitiveID = 0u; primitiveID < a.m_primitiveStride; ++primitiveID)
4365 {
4366 const uint32_t sgid = a.m_primitiveSubgroups.at(primitiveID);
4367 DE_ASSERT(sgid < (a.m_subgroupCount * a.m_subgroupSize));
4368 if (false == activeMask.test(Ballots::findBit(sgid, a.m_subgroupSize)))
4369 continue;
4370 const uint32_t index = (outLoc.at(primitiveID)++) * a.m_invocationStride + primitiveID;
4371 if (false == countOnly)
4372 {
4373 ref.at(index) = Ballot(tcu::UVec4(uint32_t(storeValue & 0xFFFFFFFF), 0u, 0u, 0u));
4374 if (cmp && logFailureCount > 0u && cmp[index] != ref.at(index))
4375 {
4376 logFailureCount -= 1u;
4377 log << tcu::TestLog::Message << logFailureCount << ": stored value mismatch from "
4378 << OPtypeToStr(reason) << tcu::TestLog::EndMessage;
4379 }
4380 }
4381 }
4382 }
4383
4384     virtual std::shared_ptr<Prerequisites> makePrerequisites(add_cref<std::vector<uint32_t>> outputP,
4385 const uint32_t subgroupSize, const uint32_t fragmentStride,
4386 const uint32_t primitiveStride,
4387 add_ref<std::vector<SubgroupState2>> stateStack,
4388 add_ref<std::vector<uint32_t>> outLoc,
4389 add_ref<uint32_t> subgroupCount) override
4390 {
4391 DE_UNREF(fragmentStride);
4392 auto prerequisites = std::make_shared<Arrangement>(outputP, subgroupSize, primitiveStride);
4393 subgroupCount = prerequisites->m_subgroupCount;
4394 stateStack.resize(10u, SubgroupState2(subgroupCount));
4395 outLoc.resize(primitiveStride, 0u);
4396 stateStack.at(0).activeMask = prerequisites->m_initialBallots;
4397 return prerequisites;
4398 }
4399 };
4400
4401 class ReconvergenceTestCase : public TestCase
4402 {
4403 public:
4404     ReconvergenceTestCase(tcu::TestContext &context, const std::string &name, const CaseDef data)
4405 : TestCase(context, name)
4406 , m_data(data)
4407 , m_program()
4408 , m_subgroupSizeToMaxLoc()
4409 {
4410 }
4411 ~ReconvergenceTestCase(void) = default;
4412 virtual void delayedInit(void) override;
4413 virtual void checkSupport(Context &context) const override;
4414 virtual void initPrograms(SourceCollections &programCollection) const override;
4415 virtual TestInstance *createInstance(Context &context) const override;
4416 de::MovePtr<RandomProgram> selectProgram() const;
4417
4418 private:
4419 CaseDef m_data;
4420 std::shared_ptr<RandomProgram> m_program;
4421 mutable std::map<uint32_t, uint32_t> m_subgroupSizeToMaxLoc;
4422 };
4423
4424 void ReconvergenceTestCase::checkSupport(Context &context) const
4425 {
4426 if (!context.contextSupports(vk::ApiVersion(0u, 1u, 1u, 0u)))
4427 TCU_THROW(NotSupportedError, "Vulkan 1.1 not supported");
4428
4429 const auto properties = getSubgroupProperties(context);
4430 const vk::VkPhysicalDeviceSubgroupProperties &subgroupProperties = properties.first;
4431 const VkPhysicalDeviceLimits &limits = properties.second.properties.limits;
4432
4433 if (m_data.isElect() && !(subgroupProperties.supportedOperations & VK_SUBGROUP_FEATURE_BASIC_BIT))
4434 TCU_THROW(NotSupportedError, "VK_SUBGROUP_FEATURE_BASIC_BIT not supported");
4435
4436 if (!m_data.isElect() && !(subgroupProperties.supportedOperations & VK_SUBGROUP_FEATURE_BALLOT_BIT))
4437 TCU_THROW(NotSupportedError, "VK_SUBGROUP_FEATURE_BALLOT_BIT not supported");
4438
4439 if (m_data.shaderStage == VK_SHADER_STAGE_COMPUTE_BIT)
4440 {
4441 if ((m_data.sizeX > limits.maxComputeWorkGroupSize[0]) || (m_data.sizeY > limits.maxComputeWorkGroupSize[1]) ||
4442 ((m_data.sizeX * m_data.sizeY) > limits.maxComputeWorkGroupInvocations))
4443 {
4444 TCU_THROW(NotSupportedError, "compute workgroup count exceeds device limit");
4445 }
4446 }
4447
4448 if (!(subgroupProperties.supportedStages & m_data.shaderStage))
4449 {
4450 std::stringstream ss;
4451 ss << getShaderStageFlagsStr(m_data.shaderStage);
4452 ss << " does not support subgroup operations";
4453 ss.flush();
4454 TCU_THROW(NotSupportedError, ss.str());
4455 }
4456
4457 // Both subgroup- AND workgroup-uniform tests are enabled by shaderSubgroupUniformControlFlow.
4458 if (m_data.isUCF() && !context.getShaderSubgroupUniformControlFlowFeatures().shaderSubgroupUniformControlFlow)
4459 TCU_THROW(NotSupportedError, "shaderSubgroupUniformControlFlow not supported");
4460
4461 if (m_data.testType == TT_MAXIMAL && !context.getShaderMaximalReconvergenceFeatures().shaderMaximalReconvergence)
4462 TCU_THROW(NotSupportedError, "shaderMaximalReconvergence not supported");
4463 }
4464
4465 de::MovePtr<RandomProgram> ReconvergenceTestCase::selectProgram() const
4466 {
4467 RandomProgram *programPtr(nullptr);
4468 switch (m_data.shaderStage)
4469 {
4470 case VK_SHADER_STAGE_COMPUTE_BIT:
4471 programPtr = new ComputeRandomProgram(m_data);
4472 break;
4473 case VK_SHADER_STAGE_FRAGMENT_BIT:
4474 programPtr = new FragmentRandomProgram(m_data);
4475 break;
4476 case VK_SHADER_STAGE_VERTEX_BIT:
4477 programPtr = new VertexRandomProgram(m_data);
4478 break;
4479 case VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT:
4480 programPtr = new TessCtrlRandomProgram(m_data, 0);
4481 break;
4482 case VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT:
4483 programPtr = new TessEvalRandomProgram(m_data);
4484 break;
4485 case VK_SHADER_STAGE_GEOMETRY_BIT:
4486 programPtr = new GeometryRandomProgram(m_data);
4487 break;
4488 default:
4489 DE_ASSERT(0);
4490 }
4491 DE_ASSERT(programPtr);
4492 return de::MovePtr<RandomProgram>(programPtr);
4493 }
4494
4495 std::string genPassThroughFragmentSource()
4496 {
4497 std::stringstream str;
4498 str << "#version 450 core\n";
4499 str << "layout(location = 0) out vec4 color;\n";
4500 str << "void main() {\n";
4501 str << " color = vec4(1.0);\n";
4502 str << "}\n";
4503 str.flush();
4504 return str.str();
4505 }
4506
4507 std::string genPassThroughVertexSource()
4508 {
4509 std::stringstream str;
4510 str << "#version 450 core\n";
4511 str << "layout(location = 0) in vec4 pos;\n";
4512 str << "void main() {\n";
4513 str << " gl_Position = vec4(pos.xy, 0.0, 1.0);\n";
4514 str << "}\n";
4515 str.flush();
4516 return str.str();
4517 }
4518
4519 std::string genPassThroughTessCtrlSource()
4520 {
4521 std::stringstream str;
4522 str << "#version 450 core\n";
4523 str << "#extension GL_EXT_tessellation_shader : require\n";
4524 str << "layout(vertices = 3) out;\n";
4525 str << "void main() {\n";
4526 str << " gl_out[gl_InvocationID].gl_Position = gl_in[gl_InvocationID].gl_Position;\n";
4527 str << " gl_TessLevelOuter[0] = 1.0;\n";
4528 str << " gl_TessLevelOuter[1] = 1.0;\n";
4529 str << " gl_TessLevelOuter[2] = 1.0;\n";
4530 str << " gl_TessLevelOuter[3] = 1.0;\n";
4531 str << " gl_TessLevelInner[0] = 1.0;\n";
4532 str << " gl_TessLevelInner[1] = 1.0;\n";
4533 str << "}\n";
4534 str.flush();
4535 return str.str();
4536 }
4537
4538 std::string genPassThroughTessEvalSource()
4539 {
4540 std::stringstream str;
4541 str << "#version 450 core\n";
4542 str << "#extension GL_EXT_tessellation_shader : require\n";
4543 str << "layout(equal_spacing, triangles) in;\n";
4544 str << "void main() {\n";
4545 str << " float u = gl_TessCoord.x;\n";
4546 str << " float v = gl_TessCoord.y;\n";
4547 str << " float w = gl_TessCoord.z;\n";
4548 str << " vec4 p0 = vec4(gl_in[0].gl_Position.xy, 0.0, 1.0);\n";
4549 str << " vec4 p1 = vec4(gl_in[1].gl_Position.xy, 0.0, 1.0);\n";
4550 str << " vec4 p2 = vec4(gl_in[2].gl_Position.xy, 0.0, 1.0);\n";
4551 str << " gl_Position = u * p0 + v * p1 + w * p2;\n";
4552 str << "}\n";
4553 str.flush();
4554 return str.str();
4555 }
4556
4557 void ReconvergenceTestCase::delayedInit(void)
4558 {
4559 m_program = std::shared_ptr<RandomProgram>(selectProgram().release());
4560 }
4561
4562 void ReconvergenceTestCase::initPrograms(SourceCollections &programCollection) const
4563 {
4564 de::MovePtr<RandomProgram> program = selectProgram();
4565
4566 m_subgroupSizeToMaxLoc = program->generateRandomProgram(m_testCtx.getWatchDog(), m_testCtx.getLog());
4567
4568 std::stringstream header, layout, globals, prologue, epilogue, aux;
4569
4570 header << "#version 450 core\n";
4571 header << "#extension GL_KHR_shader_subgroup_ballot : enable\n";
4572 header << "#extension GL_KHR_shader_subgroup_vote : enable\n";
4573 header << "#extension GL_NV_shader_subgroup_partitioned : enable\n";
4574 header << "#extension GL_EXT_subgroup_uniform_control_flow : enable\n";
4575 if (m_data.testType == TT_MAXIMAL)
4576 {
4577 header << "#extension GL_EXT_maximal_reconvergence : require\n";
4578 }
4579 switch (m_data.shaderStage)
4580 {
4581 case VK_SHADER_STAGE_COMPUTE_BIT:
4582 layout << "layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z = 1) in;\n";
4583 layout << "layout(set=0, binding=2) coherent buffer OutputC { uint loc[]; } outputC;\n";
4584 layout << "layout(set=0, binding=1) coherent buffer OutputB { uvec4 b[]; } outputB;\n";
4585 layout << "layout(set=0, binding=0) coherent buffer InputA { uint a[]; } inputA;\n";
4586 break;
4587 case VK_SHADER_STAGE_FRAGMENT_BIT:
4588 layout << "// NOTE: A fragment can belong to more than one primitive, and the shader processes each\n";
4589         layout << "// fragment primitive by primitive, so the number of invocations does not have to be\n";
4590         layout << "// equal to the number of fragments in the rendering area. Another important thing\n";
4591         layout << "// is that the implementation is free to change the order of drawing primitives\n";
4592         layout << "// between subsequent application calls.\n";
4593
4594 layout << "// inputA.a[ invocationStride ] = { 0, 1, ..., (invocationStride - 1) }\n";
4595 layout << "layout(set=0, binding=0) coherent buffer InputA { uint a[]; } inputA;\n";
4596
4597 layout << "// outputB.b[ max(loc[]) * invocationStride * primitiveStride ]\n";
4598 layout << "layout(set=0, binding=1) coherent buffer OutputB { uvec4 b[]; } outputB;\n";
4599
4600         layout << "// outputC.loc[ invocationStride * primitiveStride ], incremented per primitive\n";
4601 layout << "layout(set=0, binding=2) coherent buffer OutputC { uint loc[]; } outputC;\n";
4602
4603         layout << "// outputP.p[ width * height * primitiveStride + extra ]; the trailing entries hold the subgroup\n";
4603         layout << "// counter (used to derive subgroupID), the subgroup size and the helper/non-helper invocation counts\n";
4604 layout << "layout(set=0, binding=3) coherent buffer OutputP { uint p[]; } outputP;\n";
4605
4606 layout << "layout(location = 0) out vec4 dEQP_FragColor;\n";
4607 break;
4608 case VK_SHADER_STAGE_VERTEX_BIT:
4609 layout << "layout(location = 0) in vec4 pos;\n";
4610 layout << "layout(set=0, binding=3) coherent buffer OutputP { uint p[]; } outputP;\n";
4611 layout << "layout(set=0, binding=2) coherent buffer OutputC { uint loc[]; } outputC;\n";
4612 layout << "layout(set=0, binding=1) coherent buffer OutputB { uvec4 b[]; } outputB;\n";
4613 layout << "layout(set=0, binding=0) coherent buffer InputA { uint a[]; } inputA;\n";
4614 break;
4615 case VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT:
4616 layout << "#extension GL_EXT_tessellation_shader : require\n";
4617 layout << "layout(vertices = " << TessCtrlRandomProgram::minSubgroupSize << ") out;\n";
4618 layout << "layout(set=0, binding=2) coherent buffer OutputC { uint loc[]; } outputC;\n";
4619 layout << "layout(set=0, binding=1) coherent buffer OutputB { uvec2 b[]; } outputB;\n";
4620 layout << "layout(set=0, binding=0) coherent buffer InputA { uint a[]; } inputA;\n";
4621 break;
4622 case VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT:
4623 layout << "#extension GL_EXT_tessellation_shader : require\n";
4624 layout << "layout(equal_spacing, quads) in;\n";
4625 layout << "layout(set=0, binding=2) coherent buffer OutputC { uint loc[]; } outputC;\n";
4626 layout << "layout(set=0, binding=1) coherent buffer OutputB { uvec2 b[]; } outputB;\n";
4627 layout << "layout(set=0, binding=0) coherent buffer InputA { uint a[]; } inputA;\n";
4628 break;
4629 case VK_SHADER_STAGE_GEOMETRY_BIT:
4630 layout << "#extension GL_EXT_geometry_shader : require\n";
4631 layout << "layout(points) in;\n";
4632 layout << "layout(points, max_vertices = 1) out;\n";
4633 layout << "layout(set=0, binding=3) coherent buffer OutputP { uint p[]; } outputP;\n";
4634 layout << "layout(set=0, binding=2) coherent buffer OutputC { uint loc[]; } outputC;\n";
4635 layout << "layout(set=0, binding=1) coherent buffer OutputB { uvec4 b[]; } outputB;\n";
4636 layout << "layout(set=0, binding=0) coherent buffer InputA { uint a[]; } inputA;\n";
4637 break;
4638 default:
4639 DE_ASSERT(0);
4640 }
4641
4642 std::stringstream pushConstantLayout;
4643 pushConstantLayout
4644 << "layout(push_constant) uniform PC {\n"
4645 " // set to the real stride when writing out ballots, or zero when just counting\n"
4646 " int invocationStride;\n"
4647         " // wildcard fields, for example the dimensions of the rendered area in the case of graphics shaders\n"
4648 " int width;\n"
4649 " int height;\n"
4650 " uint primitiveStride;\n"
4651 " uint subgroupStride;\n"
4652 " uint enableInvocationIndex;\n"
4653 "};\n";
4654 pushConstantLayout.flush();
4655 layout << pushConstantLayout.str();
4656
4657 globals << "int outLoc = 0;\n";
4658 globals << "bool testBit(uvec4 mask, uint bit) { return ((mask[bit / 32] >> (bit % 32)) & 1) != 0; }\n";
4659 globals << "uint elect() { return int(subgroupElect()) + 1; }\n";
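    // Note: elect() maps subgroupElect()'s result to 2 (elected) or 1 (not elected),
    // presumably so either outcome stays distinguishable from a never-written zero slot.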
4660 if (m_data.shaderStage == VK_SHADER_STAGE_FRAGMENT_BIT)
4661 {
4662 static const std::string helperRoutinesCode(R"glsl(
4663 void setBit(uint bit, in out uvec4 ballot) {
4664 uint c = bit / 32;
4665 switch (c) {
4666 case 0: ballot.x |= (1u << (bit % 32)); break;
4667 case 1: ballot.y |= (1u << (bit % 32)); break;
4668 case 2: ballot.z |= (1u << (bit % 32)); break;
4669 case 3: ballot.w |= (1u << (bit % 32)); break;
4670 }
4671 }
4672 void resetBit(uint bit, in out uvec4 ballot) {
4673 uint c = bit / 32;
4674 uint mask = 0xFFFFFFFF ^ (1u << (bit % 32));
4675 switch (c) {
4676 case 0: ballot.x &= mask; break;
4677 case 1: ballot.y &= mask; break;
4678 case 2: ballot.z &= mask; break;
4679 case 3: ballot.w &= mask; break;
4680 }
4681 }
4682 uint fragmentIndex() { return (uint(gl_FragCoord.y) * width + uint(gl_FragCoord.x)); }
4683 uint invocationIndex() { return subgroupID * gl_SubgroupSize + gl_SubgroupInvocationID; }
4684 uvec4 invocationElectBallot() {
4685 uvec4 ballot = uvec4(0);
4686 ballot[gl_SubgroupInvocationID / 32] = (1 << (gl_SubgroupInvocationID % 32));
4687 return ballot;
4688 }
4689 uint next(uint hint) {
4690 return gl_HelperInvocation
4691 ? (hint * enableInvocationIndex)
4692 : outputC.loc[(gl_PrimitiveID * (subgroupStride * 128) + invocationIndex()) * enableInvocationIndex]++;
4693 }
4694 uint index(uint hint) {
4695 return ((
4696 next(hint) * (subgroupStride * 128 * primitiveStride)
4697 + (gl_PrimitiveID * subgroupStride * 128) + invocationIndex()) * enableInvocationIndex);
4698 }
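            // next()/index() address outputB as a 2D table: the per-invocation counter in
            // outputC.loc picks the row and (gl_PrimitiveID, invocationIndex()) the column,
            // with subgroupStride * 128 slots per primitive (128 looks like an assumed upper
            // bound on gl_SubgroupSize). Multiplying by enableInvocationIndex lets the host
            // collapse every access to element 0, presumably for the counting-only pass.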
4699 void storeValue(uint hintIndex, uvec4 value)
4700 {
4701 if (gl_HelperInvocation) {
4702 if (hintIndex < BALLOT_STACK_SIZE)
4703 ballotStack[hintIndex] = value;
4704 }
4705 else {
4706 outputB.b[index(hintIndex)] = value;
4707 }
4708 }
4709 void storeValue(uint hintIndex, uint value) { storeValue(hintIndex, uvec4(value, 0, 0, 0)); }
4710 void storeBallot(uint hintIndex) { storeValue(hintIndex, subgroupBallot(true)); }
4711 )glsl");
4712
4713 static const std::string prologueCode(R"glsl(
4714 uint helperInvocationCount = 0u;
4715 uint nonHelperInvocationCount = 0u;
4716 uvec4 helperInvocationsBits = uvec4(0, 0, 0, 0);
4717 uvec4 nonHelperInvocationsBits = uvec4(0, 0, 0, 0);
4718 if (gl_HelperInvocation)
4719 {
4720 helperInvocationsBits = subgroupBallot(true);
4721 helperInvocationCount = 1u;
4722 }
4723 else
4724 {
4725 nonHelperInvocationsBits = subgroupBallot(true);
4726 nonHelperInvocationCount = 1u;
4727 }
4728
4729 helperInvocationsBits = subgroupOr(helperInvocationsBits);
4730 nonHelperInvocationsBits = subgroupOr(nonHelperInvocationsBits);
4731 uint helperBitCount = subgroupBallotBitCount(helperInvocationsBits);
4732 uint nonHelperBitCount = subgroupBallotBitCount(nonHelperInvocationsBits);
4733 helperInvocationCount = subgroupAdd(helperInvocationCount);
4734 nonHelperInvocationCount = subgroupAdd(nonHelperInvocationCount);
4735
4736 const uint nonHelperElectBit = subgroupBallotFindLSB(nonHelperInvocationsBits);
4737 if (gl_SubgroupInvocationID == nonHelperElectBit)
4738 {
4739 subgroupID = atomicAdd(outputP.p[width * height * primitiveStride + 0], 1);
4740 outputP.p[width * height * primitiveStride + 1] = gl_SubgroupSize;
4741 atomicAdd(outputP.p[width * height * primitiveStride + 2], nonHelperInvocationCount);
4742 atomicAdd(outputP.p[width * height * primitiveStride + 3], helperInvocationCount);
4743 }
4744
4745 subgroupID = subgroupShuffle(subgroupID, nonHelperElectBit);
4746
4747 const uint localPrimitiveID = gl_PrimitiveID;
4748 const uint localFragmentID = fragmentIndex();
4749
4750 if (!gl_HelperInvocation)
4751 {
4752 outputP.p[localFragmentID * primitiveStride + localPrimitiveID] =
4753 ((subgroupID + 1) << 16) | gl_SubgroupInvocationID;
4754 }
4755
4756 // Mapping helper invocations block
4757 {
4758 uvec4 tmpHelperBits = helperInvocationsBits;
4759 uint helperSubgroupInvocationID = subgroupBallotFindLSB(tmpHelperBits);
4760 while (subgroupBallotBitExtract(tmpHelperBits, helperSubgroupInvocationID))
4761 {
4762 uint helperSubgroupID = subgroupShuffle(subgroupID, helperSubgroupInvocationID);
4763 uint helperFragmentID = subgroupShuffle(localFragmentID, helperSubgroupInvocationID);
4764 uint helperPrimitiveID = subgroupShuffle(localPrimitiveID, helperSubgroupInvocationID);
4765 if (gl_SubgroupInvocationID == nonHelperElectBit)
4766 {
4767 outputP.p[helperFragmentID * primitiveStride + helperPrimitiveID] =
4768 (((helperSubgroupID + 1) | 0x8000) << 16) | helperSubgroupInvocationID;
4769 }
4770 resetBit(helperSubgroupInvocationID, tmpHelperBits);
4771 helperSubgroupInvocationID = subgroupBallotFindLSB(tmpHelperBits);
4772 }
4773 }
4774 )glsl");
4775
4776 static const std::string epilogueCode(R"glsl(
4777 // Save helper invocations entries block
4778 {
4779 uvec4 tmpHelperBits = subgroupOr(helperInvocationsBits);
4780 uint helperSubgroupInvocationID = subgroupBallotFindLSB(tmpHelperBits);
4781 while (helperSubgroupInvocationID < gl_SubgroupSize)
4782 {
4783 const uint maxOutLoc = subgroupShuffle(outLoc, helperSubgroupInvocationID);
4784 if (maxOutLoc == 0)
4785 {
4786 resetBit(helperSubgroupInvocationID, tmpHelperBits);
4787 helperSubgroupInvocationID = subgroupBallotFindLSB(tmpHelperBits);
4788 continue;
4789 }
4790
4791 uvec4 helperBallotStack[BALLOT_STACK_SIZE];
4792 uint helperSubgroupID = subgroupShuffle(subgroupID, helperSubgroupInvocationID);
4793 uint helperFragmentID = subgroupShuffle(localFragmentID, helperSubgroupInvocationID);
4794 uint helperPrimitiveID = subgroupShuffle(localPrimitiveID, helperSubgroupInvocationID);
4795 for (uint i = 0; i < maxOutLoc && i < BALLOT_STACK_SIZE; i++) {
4796 helperBallotStack[i] = subgroupShuffle(ballotStack[i], helperSubgroupInvocationID);
4797 }
4798
4799 if (gl_SubgroupInvocationID == nonHelperElectBit)
4800 {
4801 uint helperInvocationIndex = helperSubgroupID * gl_SubgroupSize + helperSubgroupInvocationID;
4802 uint helperPrimitiveInvocationIndex = helperInvocationIndex * primitiveStride + helperPrimitiveID;
4803
4804 outputC.loc[(helperInvocationIndex * primitiveStride + helperPrimitiveID) * enableInvocationIndex] = maxOutLoc;
4805
4806 for (uint j = 0; j < maxOutLoc; j++)
4807 {
4808 uint outputIndex = ((j * (subgroupStride * 128u * primitiveStride)
4809 + (helperPrimitiveID * subgroupStride * 128u) + helperInvocationIndex) * enableInvocationIndex);
4810 uvec4 outputValue = (j < BALLOT_STACK_SIZE) ? helperBallotStack[j] : uvec4(0,0,0,0);
4811 outputB.b[outputIndex] = outputValue;
4812 }
4813 }
4814 resetBit(helperSubgroupInvocationID, tmpHelperBits);
4815 helperSubgroupInvocationID = subgroupBallotFindLSB(tmpHelperBits);
4816 } // wend
4817 }
4818
4819 dEQP_FragColor = vec4(1.0);
4820 )glsl");
4821
4822 header << "#extension GL_KHR_shader_subgroup_shuffle : enable\n";
4823 header << "#extension GL_KHR_shader_subgroup_arithmetic : enable\n";
4824 header << "#define BALLOT_STACK_SIZE " << FragmentRandomProgram::experimentalOutLocSize << '\n';
4825
4826 {
4827 aux << header.str();
4828 aux << pushConstantLayout.str();
4829 aux << "uint outLoc = 0;\n";
4830 aux << "struct OutputC { uint loc[1]; };\n";
4831 aux << "struct OutputB { uvec4 b[1]; };\n";
4832 aux << "uint subgroupID = 11111;\n";
4833 aux << "uvec4 ballotStack[BALLOT_STACK_SIZE];\n";
4834 aux << "OutputC outputC;\n";
4835 aux << "OutputB outputB;\n";
4836 aux << "// OutputP.p[ width * height * primitiveStride + 4 ], few more for calculating subgroupID, "
4837 "subgroupSize, non-helper and helper invocations\n";
4838 aux << "layout(set = 0, binding = 0) coherent buffer OutputP { uint p[]; } outputP;\n";
4839 aux << "layout(location = 0) out vec4 dEQP_FragColor;\n";
4840 aux << helperRoutinesCode;
4841 aux << "void main() {\n"
4842 << prologueCode << epilogueCode << " \n"
4843 << "}\n";
4844 }
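// The "aux" fragment shader assembled above shares the prologue/epilogue with the test
// shader but replaces outputB/outputC with throw-away local structs, so its only real
// side effect is filling outputP (binding 0). It is run first by callAuxiliaryShader()
// to discover the subgroup size, the subgroup count and the fragment-to-subgroup mapping
// before the measured pass.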
4845
4846 globals << "uint subgroupID = 22222;\n";
4847 globals << "uvec4 ballotStack[BALLOT_STACK_SIZE];\n";
4848 globals << helperRoutinesCode;
4849
4850 prologue << prologueCode;
4851 epilogue << epilogueCode;
4852 }
4853 else if (m_data.shaderStage == VK_SHADER_STAGE_VERTEX_BIT)
4854 {
4855 static const std::string helperRoutinesCode(R"glsl(
4856 uint invocationIndex() { return subgroupID * gl_SubgroupSize + gl_SubgroupInvocationID; }
4857 uvec4 invocationElectBallot() {
4858 uvec4 ballot = uvec4(0);
4859 ballot[gl_SubgroupInvocationID / 32] = (1 << (gl_SubgroupInvocationID % 32));
4860 return ballot;
4861 }
4862 void storeValue(uint loc, uvec4 value) {
4863 outputC.loc[gl_VertexIndex] = loc + 1u;
4864 outputB.b[(loc * invocationStride + gl_VertexIndex) * enableInvocationIndex] = value;
4865 }
4866 void storeValue(uint loc, uint value) { storeValue(loc, uvec4(value, 0, 0, 0)); }
4867 )glsl");
4868
4869 static const std::string prologueCode(R"glsl(
4870 uint invocationCount = 1u;
4871 invocationCount = subgroupAdd(invocationCount);
4872
4873 if (subgroupElect())
4874 {
4875 subgroupID = atomicAdd(outputP.p[NUM_SUBGROUPS_OFFSET], 1u); // [+0] subgroupID
4876 outputP.p[SUBGROUP_SIZE_OFFSET] = gl_SubgroupSize; // [+1] subgroupSize
4877 atomicAdd(outputP.p[INVOCATION_COUNT_OFFSET], invocationCount); // [+2] invocationCount
4878 }
4879 subgroupID = subgroupBroadcastFirst(subgroupID);
4880
4881 outputP.p[gl_VertexIndex + INVOCATION_ENTRIES_OFFSET] = ((subgroupID + 1) << 16) | gl_SubgroupInvocationID;
4882 )glsl");
4883
4884 static const std::string epilogueCode(R"glsl(
4885 gl_Position = vec4(pos.xy, 0.0, 1.0);
4886 gl_PointSize = 1.0;
4887 )glsl");
4888
4889 header << "#extension GL_KHR_shader_subgroup_arithmetic : enable\n";
4890 header << "#define NUM_SUBGROUPS_OFFSET 0\n";
4891 header << "#define SUBGROUP_SIZE_OFFSET 1\n";
4892 header << "#define INVOCATION_COUNT_OFFSET 2\n";
4893 header << "#define INVOCATION_ENTRIES_OFFSET 3\n";
4894
4895 globals << "uint subgroupID = 33333;\n";
4896 globals << helperRoutinesCode;
4897
4898 prologue << prologueCode;
4899 epilogue << epilogueCode;
4900 }
4901 else if (m_data.shaderStage == VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT)
4902 {
4903 // push_constant::width holds the smallest subgroup size defined in TessCtrlRandomProgram::minSubgroupSize
4904 globals << "// push_constant::width is the smallest subgroup size which this shader is run on\n";
4905 globals << "uint invocationIndex() { return ((((gl_PrimitiveID * width) / gl_SubgroupSize) * gl_SubgroupSize) "
4906 "+ gl_SubgroupInvocationID); }\n";
4907
4908 epilogue
4909 << " gl_out[gl_InvocationID].gl_Position = gl_in[gl_InvocationID % gl_PatchVerticesIn].gl_Position;\n";
4910 epilogue << " gl_TessLevelOuter[0] = 1.0;\n";
4911 epilogue << " gl_TessLevelOuter[1] = 1.0;\n";
4912 epilogue << " gl_TessLevelOuter[2] = 1.0;\n";
4913 epilogue << " gl_TessLevelOuter[3] = 1.0;\n";
4914 epilogue << " gl_TessLevelInner[0] = 1.0;\n";
4915 epilogue << " gl_TessLevelInner[1] = 1.0;\n";
4916 }
4917 else if (m_data.shaderStage == VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT)
4918 {
4919 globals << "// push_constant::width is an invocation count when processing a quad for a single patch\n";
4920 globals << "uint invocationIndex() { return ((((gl_PrimitiveID * width) / gl_SubgroupSize) * gl_SubgroupSize) "
4921 "+ gl_SubgroupInvocationID); }\n";
4922
4923 epilogue << " float u = gl_TessCoord.x;\n";
4924 epilogue << " float v = gl_TessCoord.y;\n";
4925 epilogue << " float w = gl_TessCoord.z;\n";
4926 epilogue << " vec4 p0 = vec4(gl_in[0].gl_Position.xy, 0.0, 1.0);\n";
4927 epilogue << " vec4 p1 = vec4(gl_in[1].gl_Position.xy, 0.0, 1.0);\n";
4928 epilogue << " vec4 p2 = vec4(gl_in[2].gl_Position.xy, 0.0, 1.0);\n";
4929 epilogue << " gl_Position = u * p0 + v * p1 + w * p2;\n";
4930 }
4931 else if (m_data.shaderStage == VK_SHADER_STAGE_GEOMETRY_BIT)
4932 {
4933 static const std::string helperRoutinesCode(R"glsl(
4934 uint invocationIndex() { return subgroupID * gl_SubgroupSize + gl_SubgroupInvocationID; }
4935 void storeValue(uint loc, uvec4 value) {
4936 outputC.loc[gl_PrimitiveIDIn] = loc + 1u;
4937 outputB.b[(loc * invocationStride + gl_PrimitiveIDIn) * enableInvocationIndex] = value;
4938 }
4939 void storeValue(uint loc, uint value) { storeValue(loc, uvec4(value, 0, 0, 0)); }
4940 void storeBallot(uint loc) { storeValue(loc, subgroupBallot(true)); }
4941 uvec4 invocationElectBallot() {
4942 uvec4 ballot = uvec4(0);
4943 ballot[gl_SubgroupInvocationID / 32] = (1 << (gl_SubgroupInvocationID % 32));
4944 return ballot;
4945 }
4946 )glsl");
4947
4948 static const std::string prologueCode(R"glsl(
4949 uint invocationCount = 1u;
4950 invocationCount = subgroupAdd(invocationCount);
4951 uint identity = gl_PrimitiveIDIn + 1u;
4952 uint maxIdentity = subgroupMax(identity);
4953
4954 if (subgroupElect()) {
4955 subgroupID = atomicAdd(outputP.p[SUBGROUP_ID_OFFSET], 1u); // [+0] subgroupID
4956 outputP.p[SUBGROUP_SIZE_OFFSET] = gl_SubgroupSize; // [+1] subgroupSize
4957 atomicAdd(outputP.p[INVOCATION_COUNT_OFFSET], invocationCount); // [+2] invocationCount
4958 atomicMax(outputP.p[MAX_IDENTITY_OFFSET], maxIdentity);
4959 }
4960 subgroupID = subgroupBroadcastFirst(subgroupID);
4961
4962 outputP.p[gl_PrimitiveIDIn + INVOCATION_ENTRY_OFFSET] = ((subgroupID + 1) << 16) | gl_SubgroupInvocationID;
4963
4964 )glsl");
4965
4966 static const std::string epilogueCode(R"glsl(
4967 uint maxLoc = subgroupMax(outLoc);
4968 atomicMax(outputP.p[MAX_LOC_OFFSET], maxLoc);
4969
4970 gl_Position = gl_in[gl_PrimitiveIDIn].gl_Position;
4971 gl_PrimitiveID = gl_PrimitiveIDIn;
4972
4973 EmitVertex();
4974 EndPrimitive();
4975 )glsl");
4976
4977 header << "#extension GL_KHR_shader_subgroup_arithmetic : enable\n";
4978 header << "#define SUBGROUP_ID_OFFSET 0\n";
4979 header << "#define SUBGROUP_SIZE_OFFSET 1\n";
4980 header << "#define INVOCATION_COUNT_OFFSET 2\n";
4981 header << "#define MAX_LOC_OFFSET 3\n";
4982 header << "#define MAX_IDENTITY_OFFSET 4\n";
4983 header << "#define INVOCATION_ENTRY_OFFSET 5\n";
4984
4985 globals << "uint subgroupID;\n";
4986 globals << "uint numSubgroups;\n";
4987 globals << helperRoutinesCode;
4988
4989 prologue << prologueCode;
4990 epilogue << epilogueCode;
4991 }
4992
4993 std::stringstream css, functions, main;
4994 m_program->printCode(functions, main);
4995
4996 css << header.str();
4997 css << layout.str();
4998 css << globals.str();
4999
5000 css << functions.str() << "\n\n";
5001
5002 css << "void main()\n"
5003 << (m_data.isSUCF() ? "[[subgroup_uniform_control_flow]]\n" : "")
5004 << (m_data.testType == TT_MAXIMAL ? "[[maximally_reconverges]]\n" : "") << "{\n";
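// These [[...]] attributes are the entry-point attributes defined by
// GL_EXT_subgroup_uniform_control_flow and GL_EXT_maximal_reconvergence; the matching
// #extension directives are assumed to be part of the header assembled elsewhere.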
5005
5006 css << prologue.str() << "\n";
5007 css << main.str() << "\n\n";
5008 css << epilogue.str() << "\n";
5009
5010 css << "}\n";
5011
5012 const vk::ShaderBuildOptions buildOptions(programCollection.usedVulkanVersion, vk::SPIRV_VERSION_1_3, 0u);
5013
5014 auto &testingShader = programCollection.glslSources.add("test");
5015 switch (m_data.shaderStage)
5016 {
5017 case VK_SHADER_STAGE_COMPUTE_BIT:
5018 testingShader << glu::ComputeSource(css.str()) << buildOptions;
5019 break;
5020 case VK_SHADER_STAGE_FRAGMENT_BIT:
5021 testingShader << glu::FragmentSource(css.str()) << buildOptions;
5022 programCollection.glslSources.add("vert") << glu::VertexSource(genPassThroughVertexSource()) << buildOptions;
5023 programCollection.glslSources.add("aux") << glu::FragmentSource(aux.str()) << buildOptions;
5024 break;
5025 case VK_SHADER_STAGE_VERTEX_BIT:
5026 testingShader << glu::VertexSource(css.str()) << buildOptions;
5027 programCollection.glslSources.add("frag") << glu::FragmentSource(genPassThroughFragmentSource());
5028 break;
5029 case VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT:
5030 testingShader << glu::TessellationControlSource(css.str()) << buildOptions;
5031 programCollection.glslSources.add("vert") << glu::VertexSource(genPassThroughVertexSource());
5032 programCollection.glslSources.add("frag") << glu::FragmentSource(genPassThroughFragmentSource());
5033 programCollection.glslSources.add("tese") << glu::TessellationEvaluationSource(genPassThroughTessEvalSource());
5034 break;
5035 case VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT:
5036 testingShader << glu::TessellationEvaluationSource(css.str()) << buildOptions;
5037 programCollection.glslSources.add("vert") << glu::VertexSource(genPassThroughVertexSource());
5038 programCollection.glslSources.add("frag") << glu::FragmentSource(genPassThroughFragmentSource());
5039 programCollection.glslSources.add("tesc") << glu::TessellationControlSource(genPassThroughTessCtrlSource());
5040 break;
5041 case VK_SHADER_STAGE_GEOMETRY_BIT:
5042 testingShader << glu::GeometrySource(css.str()) << buildOptions;
5043 programCollection.glslSources.add("vert") << glu::VertexSource(genPassThroughVertexSource());
5044 programCollection.glslSources.add("frag") << glu::FragmentSource(genPassThroughFragmentSource());
5045 break;
5046 default:
5047 DE_ASSERT(0);
5048 }
5049 }
5050
5051 TestInstance *ReconvergenceTestCase::createInstance(Context &context) const
5052 {
5053 switch (m_data.shaderStage)
5054 {
5055 case VK_SHADER_STAGE_COMPUTE_BIT:
5056 return new ReconvergenceTestComputeInstance(context, m_data, m_program, std::move(m_subgroupSizeToMaxLoc));
5057 case VK_SHADER_STAGE_FRAGMENT_BIT:
5058 return new ReconvergenceTestFragmentInstance(context, m_data);
5059 case VK_SHADER_STAGE_VERTEX_BIT:
5060 return new ReconvergenceTestVertexInstance(context, m_data);
5061 case VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT:
5062 return new ReconvergenceTestTessCtrlInstance(context, m_data);
5063 case VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT:
5064 return new ReconvergenceTestTessEvalInstance(context, m_data);
5065 case VK_SHADER_STAGE_GEOMETRY_BIT:
5066 return new ReconvergenceTestGeometryInstance(context, m_data);
5067 default:
5068 DE_ASSERT(false);
5069 }
5070 return nullptr;
5071 }
5072
5073 tcu::TestStatus ReconvergenceTestComputeInstance::iterate(void)
5074 {
5075 const DeviceInterface &vk = m_context.getDeviceInterface();
5076 const VkDevice device = m_context.getDevice();
5077 Allocator &allocator = m_context.getDefaultAllocator();
5078 tcu::TestLog &log = m_context.getTestContext().getLog();
5079 const VkPhysicalDeviceLimits &limits = m_context.getDeviceProperties().limits;
5080
5081 const uint32_t invocationStride = m_data.sizeX * m_data.sizeY;
5082
5083 std::vector<tcu::UVec4> ref;
5084 add_ref<ComputeRandomProgram> program(*m_program);
5085
5086 uint32_t precalculatedMaxLoc = 0u;
5087 if (auto itPrecalculatedMaxLoc = m_subgroupSizeToMaxLoc.find(m_subgroupSize);
5088 itPrecalculatedMaxLoc != m_subgroupSizeToMaxLoc.end())
5089 {
5090 precalculatedMaxLoc = itPrecalculatedMaxLoc->second;
5091 }
5092 uint32_t maxLoc = precalculatedMaxLoc ? precalculatedMaxLoc :
5093 program.execute(m_context.getTestContext().getWatchDog(), true,
5094 m_subgroupSize, 0u, invocationStride, ref, log);
5095 uint32_t shaderMaxLoc = maxLoc;
5096
5097 // maxLoc is per-invocation. Add one (to make sure no additional writes are done) and multiply by
5098 // the number of invocations
5099 maxLoc++;
5100 maxLoc *= invocationStride;
5101
5102 // buffer[0] is an input filled with a[i] == i
5103 // buffer[1] is the output
5104 // buffer[2] is the location counts
5105 de::MovePtr<BufferWithMemory> buffers[3];
5106 vk::VkDescriptorBufferInfo bufferDescriptors[3];
5107
5108 VkDeviceSize sizes[3] = {
5109 invocationStride * sizeof(uint32_t),
5110 maxLoc * sizeof(tcu::UVec4),
5111 invocationStride * sizeof(uint32_t),
5112 };
5113
5114 for (uint32_t i = 0; i < 3; ++i)
5115 {
5116 if (sizes[i] > limits.maxStorageBufferRange)
5117 TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
5118
5119 try
5120 {
5121 buffers[i] = de::MovePtr<BufferWithMemory>(new BufferWithMemory(
5122 vk, device, allocator,
5123 makeBufferCreateInfo(sizes[i], VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
5124 VK_BUFFER_USAGE_TRANSFER_SRC_BIT),
5125 MemoryRequirement::HostVisible | MemoryRequirement::Cached));
5126 }
5127 catch (tcu::ResourceError &)
5128 {
5129 // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
5130 return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
5131 "Failed device memory allocation " + de::toString(sizes[i]) + " bytes");
5132 }
5133 bufferDescriptors[i] = makeDescriptorBufferInfo(**buffers[i], 0, sizes[i]);
5134 }
5135
5136 void *ptrs[3];
5137 for (uint32_t i = 0; i < 3; ++i)
5138 {
5139 ptrs[i] = buffers[i]->getAllocation().getHostPtr();
5140 }
5141 for (uint32_t i = 0; i < sizes[0] / sizeof(uint32_t); ++i)
5142 {
5143 ((uint32_t *)ptrs[0])[i] = i;
5144 }
5145 deMemset(ptrs[1], 0, (size_t)sizes[1]);
5146 deMemset(ptrs[2], 0, (size_t)sizes[2]);
5147
5148 vk::DescriptorSetLayoutBuilder layoutBuilder;
5149
5150 layoutBuilder.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_data.shaderStage);
5151 layoutBuilder.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_data.shaderStage);
5152 layoutBuilder.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_data.shaderStage);
5153
5154 vk::Unique<vk::VkDescriptorSetLayout> descriptorSetLayout(layoutBuilder.build(vk, device));
5155
5156 vk::Unique<vk::VkDescriptorPool> descriptorPool(
5157 vk::DescriptorPoolBuilder()
5158 .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 3u)
5159 .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
5160 vk::Unique<vk::VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
5161
5162 const VkPushConstantRange pushConstantRange = {
5163 (VkShaderStageFlags)m_data.shaderStage, // VkShaderStageFlags stageFlags;
5164 0u, // uint32_t offset;
5165 sizeof(PushConstant) // uint32_t size;
5166 };
5167
5168 const VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo = {
5169 VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, // sType
5170 nullptr, // pNext
5171 (VkPipelineLayoutCreateFlags)0,
5172 1, // setLayoutCount
5173 &descriptorSetLayout.get(), // pSetLayouts
5174 1u, // pushConstantRangeCount
5175 &pushConstantRange, // pPushConstantRanges
5176 };
5177
5178 flushAlloc(vk, device, buffers[0]->getAllocation());
5179 flushAlloc(vk, device, buffers[1]->getAllocation());
5180 flushAlloc(vk, device, buffers[2]->getAllocation());
5181
5182 const VkPipelineBindPoint bindPoint = VK_PIPELINE_BIND_POINT_COMPUTE;
5183 const Unique<VkShaderModule> shader(createShaderModule(vk, device, m_context.getBinaryCollection().get("test"), 0));
5184 Move<VkPipelineLayout> pipelineLayout = createPipelineLayout(vk, device, &pipelineLayoutCreateInfo, NULL);
5185 Move<VkPipeline> pipeline = createComputePipeline(*pipelineLayout, *shader);
5186 const VkQueue queue = m_context.getUniversalQueue();
5187 Move<VkCommandPool> cmdPool = createCommandPool(vk, device, vk::VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT,
5188 m_context.getUniversalQueueFamilyIndex());
5189 Move<VkCommandBuffer> cmdBuffer = allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY);
5190
5191 vk::DescriptorSetUpdateBuilder setUpdateBuilder;
5192 setUpdateBuilder.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(0),
5193 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptors[0]);
5194 setUpdateBuilder.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(1),
5195 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptors[1]);
5196 setUpdateBuilder.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(2),
5197 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptors[2]);
5198 setUpdateBuilder.update(vk, device);
5199
5200 PushConstant pc{/* pc.invocationStride is zero-initialized, as are the remaining fields */};
5201
5202 // compute "maxLoc", the maximum number of locations written
5203 beginCommandBuffer(vk, *cmdBuffer, 0u);
5204 vk.cmdBindDescriptorSets(*cmdBuffer, bindPoint, *pipelineLayout, 0u, 1, &*descriptorSet, 0u, nullptr);
5205 vk.cmdBindPipeline(*cmdBuffer, bindPoint, *pipeline);
5206 vk.cmdPushConstants(*cmdBuffer, *pipelineLayout, m_data.shaderStage, 0, sizeof(pc), &pc);
5207 vk.cmdDispatch(*cmdBuffer, 1, 1, 1);
5208 endCommandBuffer(vk, *cmdBuffer);
5209
5210 submitCommandsAndWait(vk, device, queue, cmdBuffer.get());
5211
5212 invalidateAlloc(vk, device, buffers[1]->getAllocation());
5213 invalidateAlloc(vk, device, buffers[2]->getAllocation());
5214
5215 // Take the max over all invocations. Add one (to make sure no additional writes are done) and multiply by
5216 // the number of invocations
5217 uint32_t newMaxLoc = 0;
5218 for (uint32_t id = 0; id < invocationStride; ++id)
5219 newMaxLoc = de::max(newMaxLoc, ((uint32_t *)ptrs[2])[id]);
5220 shaderMaxLoc = newMaxLoc;
5221 newMaxLoc++;
5222 newMaxLoc *= invocationStride;
5223
5224 // If we need more space, reallocate buffers[1]
5225 if (newMaxLoc > maxLoc)
5226 {
5227 maxLoc = newMaxLoc;
5228 sizes[1] = maxLoc * sizeof(tcu::UVec4);
5229
5230 if (sizes[1] > limits.maxStorageBufferRange)
5231 TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
5232
5233 try
5234 {
5235 buffers[1] = de::MovePtr<BufferWithMemory>(new BufferWithMemory(
5236 vk, device, allocator,
5237 makeBufferCreateInfo(sizes[1], VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
5238 VK_BUFFER_USAGE_TRANSFER_SRC_BIT),
5239 MemoryRequirement::HostVisible | MemoryRequirement::Cached));
5240 }
5241 catch (tcu::ResourceError &)
5242 {
5243 // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
5244 return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
5245 "Failed device memory allocation " + de::toString(sizes[1]) + " bytes");
5246 }
5247 bufferDescriptors[1] = makeDescriptorBufferInfo(**buffers[1], 0, sizes[1]);
5248 ptrs[1] = buffers[1]->getAllocation().getHostPtr();
5249
5250 vk::DescriptorSetUpdateBuilder setUpdateBuilder2;
5251 setUpdateBuilder2.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(1),
5252 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptors[1]);
5253 setUpdateBuilder2.update(vk, device);
5254 }
5255
5256 // Clear any writes to buffer[1] during the counting pass
5257 deMemset(ptrs[1], 0, (size_t)sizes[1]);
5258 flushAlloc(vk, device, buffers[1]->getAllocation());
5259 // Clear any writes to buffer[2] during the counting pass
5260 deMemset(ptrs[2], 0, (size_t)sizes[2]);
5261 flushAlloc(vk, device, buffers[2]->getAllocation());
5262
5263 // switch the shader to the ballot-writing pass by setting the real invocationStride
5264 pc.invocationStride = invocationStride;
5265
5266 // run the actual shader
5267 beginCommandBuffer(vk, *cmdBuffer, 0u);
5268 vk.cmdBindDescriptorSets(*cmdBuffer, bindPoint, *pipelineLayout, 0u, 1, &*descriptorSet, 0u, nullptr);
5269 vk.cmdBindPipeline(*cmdBuffer, bindPoint, *pipeline);
5270 vk.cmdPushConstants(*cmdBuffer, *pipelineLayout, m_data.shaderStage, 0, sizeof(pc), &pc);
5271 vk.cmdDispatch(*cmdBuffer, 1, 1, 1);
5272 endCommandBuffer(vk, *cmdBuffer);
5273
5274 submitCommandsAndWait(vk, device, queue, cmdBuffer.get());
5275
5276 invalidateAlloc(vk, device, buffers[1]->getAllocation());
5277
5278 // Simulate execution on the CPU, and compare against the GPU result
5279 try
5280 {
5281 ref.resize(maxLoc, tcu::UVec4());
5282 }
5283 catch (const std::bad_alloc &)
5284 {
5285 // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
5286 return tcu::TestStatus(QP_TEST_RESULT_NOT_SUPPORTED,
5287 "Failed system memory allocation " + de::toString(maxLoc * sizeof(uint64_t)) + " bytes");
5288 }
5289
5290 program.execute(m_context.getTestContext().getWatchDog(), false, m_subgroupSize, 0u, invocationStride, ref, log);
5291
5292 const tcu::UVec4 *result = (const tcu::UVec4 *)ptrs[1];
5293
5294 qpTestResult res = calculateAndLogResult(result, ref, invocationStride, m_subgroupSize, shaderMaxLoc);
5295
5296 return tcu::TestStatus(res, qpGetTestResultName(res));
5297 }
5298
5299 qpTestResult_e ReconvergenceTestComputeInstance::calculateAndLogResult(const tcu::UVec4 *result,
5300 const std::vector<tcu::UVec4> &ref,
5301 uint32_t invocationStride, uint32_t subgroupSize,
5302 uint32_t shaderMaxLoc)
5303 {
5304 const uint32_t maxLoc = static_cast<uint32_t>(ref.size());
5305 tcu::TestLog &log = m_context.getTestContext().getLog();
5306 qpTestResult res = QP_TEST_RESULT_PASS;
5307 DE_ASSERT(subgroupSize * shaderMaxLoc <= maxLoc);
5308 DE_UNREF(shaderMaxLoc);
5309
5310 uint32_t mismatchCount = 0u;
5311 const uint32_t printMismatchCount = 5u;
5312 if (m_data.testType == TT_MAXIMAL)
5313 {
5314 // With maximal reconvergence, we should expect the output to exactly match
5315 // the reference.
5316 for (uint32_t i = 0; i < maxLoc; ++i)
5317 {
5318 const Ballot resultVal(result[i], subgroupSize);
5319 const Ballot refVal(ref[i], subgroupSize);
5320 if (resultVal != refVal)
5321 {
5322 res = QP_TEST_RESULT_FAIL;
5323 if (mismatchCount++ < printMismatchCount)
5324 {
5325 log << tcu::TestLog::Message << "Mismatch at " << i << "\nexpected: " << refVal
5326 << "\n got: " << resultVal << tcu::TestLog::EndMessage;
5327 }
5328 else
5329 break;
5330 }
5331 }
5332
5333 #if 0 // This log can be large and slow, ifdef it out by default
5334 log << tcu::TestLog::Message << "subgroupSize:" << subgroupSize << ", invocationStride:" << invocationStride << ", maxLoc:" << shaderMaxLoc << tcu::TestLog::EndMessage;
5335 uint32_t invMax = std::min(invocationStride, 50u);
5336 for (uint32_t inv = 0; inv < invMax; ++inv)
5337 {
5338 auto ll = log << tcu::TestLog::Message;
5339 ll << inv << ": ";
5340 for (uint32_t loc = 0; loc < shaderMaxLoc; ++loc)
5341 {
5342 uint64_t entry = result[loc * invocationStride + inv];
5343 ll << de::toString(loc) << ":" << tcu::toHex(entry) << ' ';
5344 }
5345 ll << tcu::TestLog::EndMessage;
5346 }
5347 #endif
5348
5349 if (res != QP_TEST_RESULT_PASS)
5350 {
5351 for (uint32_t i = 0; i < maxLoc; ++i)
5352 {
5353 #if 0
5354 // This log can be large and slow, ifdef it out by default
5355 const Ballot resultVal(result[i], subgroupSize);
5356 const Ballot refVal(ref[i], subgroupSize);
5357 log << tcu::TestLog::Message << "result " << i << "(" << (i / invocationStride) << ", " << (i % invocationStride) << "): " << resultVal << " ref " << refVal << (resultVal != refVal ? " different" : "") << tcu::TestLog::EndMessage;
5358 #endif
5359 }
5360 }
5361 }
5362 else
5363 {
5364 DE_ASSERT(subgroupSize != 0);
5365
5366 Ballot fullMask = subgroupSizeToMask(subgroupSize, 0 /* ignored */);
5367 // For subgroup_uniform_control_flow, we expect any fully converged outputs in the reference
5368 // to have a corresponding fully converged output in the result. So walk through each lane's
5369 // results, and for each reference value of fullMask, find a corresponding result value of
5370 // fullMask where the previous value (OP_STORE) matches. That means these came from the same
5371 // source location.
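// For example, with invocationStride == 4 lane 1 samples indices 5, 9, 13, ... If
// ref[9] is the first full mask for that lane, we scan result[5], result[9], ... for the
// expected ballot whose predecessor result[resLoc - 4] equals ref[5], i.e. the OP_STORE
// value written just before the converged ballot.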
5372 vector<uint32_t> firstFail(invocationStride, 0);
5373 for (uint32_t lane = 0; lane < invocationStride; ++lane)
5374 {
5375 uint32_t resLoc = lane + invocationStride, refLoc = lane + invocationStride;
5376 while (refLoc < maxLoc)
5377 {
5378 while (refLoc < maxLoc && ref[refLoc] != fullMask)
5379 refLoc += invocationStride;
5380 if (refLoc >= maxLoc)
5381 break;
5382
5383 // For TT_SUCF_ELECT, when the reference result has a full mask, we expect lane 0 to be elected
5384 // (a value of 2) and all other lanes to be not elected (a value of 1). For TT_SUCF_BALLOT, we
5385 // expect a full mask. Search until we find the expected result with a matching store value in
5386 // the previous result.
5387 Ballot expectedResult = m_data.isElect() ? Ballot((lane % m_subgroupSize) == 0 ? 2 : 1) : fullMask;
5388
5389 while (resLoc < maxLoc && !(result[resLoc] == expectedResult &&
5390 result[resLoc - invocationStride] == ref[refLoc - invocationStride]))
5391 resLoc += invocationStride;
5392
5393 // If we didn't find this output in the result, flag it as an error.
5394 if (resLoc >= maxLoc)
5395 {
5396 firstFail[lane] = refLoc;
5397 log << tcu::TestLog::Message << "lane " << lane << " first mismatch at " << firstFail[lane]
5398 << tcu::TestLog::EndMessage;
5399 res = QP_TEST_RESULT_FAIL;
5400 break;
5401 }
5402 refLoc += invocationStride;
5403 resLoc += invocationStride;
5404 }
5405 }
5406
5407 if (res != QP_TEST_RESULT_PASS)
5408 {
5409 for (uint32_t i = 0; i < maxLoc; ++i)
5410 {
5411 // This log can be large and slow, ifdef it out by default
5412 #if 0
5413 log << tcu::TestLog::Message << "result " << i << "(" << (i / invocationStride) << ", " << (i % invocationStride) << "): " << tcu::toHex(result[i]) << " ref " << tcu::toHex(ref[i]) << (i == firstFail[i % invocationStride] ? " first fail" : "") << tcu::TestLog::EndMessage;
5414 #endif
5415 }
5416 }
5417 }
5418
5419 return res;
5420 }
5421
5422 VkRenderPassBeginInfo ReconvergenceTestGraphicsInstance::makeRenderPassBeginInfo(const VkRenderPass renderPass,
5423 const VkFramebuffer framebuffer)
5424 {
5425 static const VkClearValue clearValue{{{0u, 0u, 0u, 0u}}};
5426 return {
5427 VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO, // VkStructureType sType;
5428 nullptr, // const void* pNext;
5429 renderPass, // VkRenderPass renderPass;
5430 framebuffer, // VkFramebuffer framebuffer;
5431 makeRect2D(m_data.sizeX, m_data.sizeY), // VkRect2D renderArea;
5432 1u, // uint32_t clearValueCount;
5433 &clearValue // const VkClearValue* pClearValues;
5434 };
5435 }
5436
5437 de::MovePtr<BufferWithMemory> ReconvergenceTestGraphicsInstance::createVertexBufferAndFlush(
5438 uint32_t cellsHorz, uint32_t cellsVert, VkPrimitiveTopology topology)
5439 {
5440 uint32_t vertexCount = cellsHorz * cellsVert;
5441 uint32_t triangleCount = cellsHorz * cellsVert;
5442 switch (topology)
5443 {
5444 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
5445 vertexCount = triangleCount * 3;
5446 break;
5447 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
5448 vertexCount = triangleCount - 1 + 3; // a strip of N triangles needs N + 2 vertices
5449 break;
5450 case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST:
5451 case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
5452 triangleCount = vertexCount - 3 + 1;
5453 break;
5454 default:
5455 DE_ASSERT(0);
5456 }
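// Below, TRIANGLE_LIST fills the cellsHorz x cellsVert grid with one small triangle per
// cell, while the remaining topologies build a horizontal band of quads split into
// triangles along the X axis.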
5457
5458 const DeviceInterface &vk = m_context.getDeviceInterface();
5459 const VkDevice device = m_context.getDevice();
5460 Allocator &allocator = m_context.getDefaultAllocator();
5461 const VkDeviceSize bufferSize = VkDeviceSize(vertexCount) * sizeof(Vertex);
5462 const VkBufferUsageFlags bufferUsage = VK_BUFFER_USAGE_VERTEX_BUFFER_BIT;
5463 const VkBufferCreateInfo createInfo = makeBufferCreateInfo(bufferSize, bufferUsage);
5464 const MemoryRequirement memoryReqs = (MemoryRequirement::HostVisible | MemoryRequirement::Coherent);
5465 de::MovePtr<BufferWithMemory> buffer(new BufferWithMemory(vk, device, allocator, createInfo, memoryReqs));
5466 Allocation &allocation = buffer->getAllocation();
5467 Vertex *vertices = static_cast<Vertex *>(allocation.getHostPtr());
5468
5469 if (VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST == topology)
5470 {
5471 const float stepX = 2.0f / float(cellsHorz);
5472 const float stepY = 2.0f / float(cellsVert);
5473
5474 uint32_t t = 0;
5475 float y = -1.0f;
5476 for (uint32_t h = 0; h < cellsVert; ++h)
5477 {
5478 float x = -1.0f;
5479 const float yy = y + stepY;
5480 for (uint32_t w = 0; w < cellsHorz; ++w)
5481 {
5482 const float xx = x + stepX;
5483
5484 vertices[t++] = {x, yy, 0.f, 0.f};
5485 vertices[t++] = {((xx + x) / 2.f), y, 0.f, 0.f};
5486 vertices[t++] = {xx, ((yy + y) / 2.f), 0.f, 0.f};
5487
5488 x = xx;
5489 }
5490 y = yy;
5491 }
5492 DE_ASSERT(vertexCount == t);
5493 }
5494 else
5495 {
5496 const uint32_t div = static_cast<uint32_t>(ROUNDUP(triangleCount, 2) / 2);
5497 const float step = 2.0f / static_cast<float>(div);
5498
5499 uint32_t t = 0;
5500 float x = -1.0f;
5501 for (uint32_t i = 0; i < div; ++i)
5502 {
5503 const bool last = ((div - i) == 1u);
5504 const float xNext = last ? +1.0f : (x + step);
5505
5506 const Vertex v0{x, +1.0f, 0.0f, 0.0f};
5507 const Vertex v1{xNext, +1.0f, 0.0f, 0.0f};
5508 const Vertex v2{xNext, -1.0f, 0.0f, 0.0f};
5509 const Vertex v3{x, -1.0f, 0.0f, 0.0f};
5510
5511 if (t == 0)
5512 {
5513 vertices[0] = v0;
5514 vertices[1] = v3;
5515 vertices[2] = v1;
5516
5517 t = 3;
5518 }
5519 else
5520 {
5521 vertices[t++] = v1;
5522 }
5523
5524 if (!last || !(triangleCount % 2))
5525 {
5526 vertices[t++] = v2;
5527 }
5528
5529 x += step;
5530 }
5531 DE_ASSERT(vertexCount == t);
5532 }
5533
5534 flushAlloc(vk, device, allocation);
5535 return buffer;
5536 }
5537 std::vector<tcu::Vec4> ReconvergenceTestGraphicsInstance::generateVertices(const uint32_t primitiveCount,
5538 const VkPrimitiveTopology topology,
5539 const uint32_t patchSize)
5540 {
5541 auto cast = [](const float f) -> float { return ((f * 2.0f) - 1.0f); };
5542 auto bestRect = [](const uint32_t c) -> std::pair<uint32_t, uint32_t>
5543 {
5544 uint32_t a = 1;
5545 uint32_t b = 1;
5546 do
5547 {
5548 a = a + 1;
5549 b = (c / a) + ((c % a) ? 1 : 0);
5550 } while (a < b);
5551 return {a, b};
5552 };
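// bestRect() returns a near-square factorization {a, b} with a >= b and a * b >= c:
// it keeps growing a until a is no longer smaller than ceil(c / a).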
5553
5554 uint32_t triangleCount = 0;
5555 uint32_t vertexCount = 0;
5556 switch (topology)
5557 {
5558 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
5559 triangleCount = primitiveCount;
5560 vertexCount = triangleCount + 3 - 1;
5561 break;
5562 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
5563 triangleCount = primitiveCount;
5564 vertexCount = triangleCount * 3;
5565 break;
5566 case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
5567 vertexCount = primitiveCount;
5568 break;
5569 case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST:
5570 vertexCount = primitiveCount * patchSize;
5571 triangleCount = ROUNDUP(vertexCount, 3) / 3;
5572 break;
5573 default:
5574 DE_ASSERT(false);
5575 }
5576
5577 if (3 == vertexCount)
5578 {
5579 return {{-1.0f, +1.0f, 0.0f, 1.0f}, {0.0f, -1.0f, 0.0f, 1.0f}, {+1.0f, +1.0f, 0.0f, 1.0f}};
5580 }
5581
5582 std::vector<tcu::Vec4> vertices(vertexCount);
5583
5584 if (VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP == topology)
5585 {
5586 uint32_t v = 0;
5587 const uint32_t div = ROUNDUP(triangleCount, 2) / 2;
5588
5589 for (uint32_t i = 0; i < triangleCount && v < vertexCount; ++i)
5590 {
5591 const float xx = cast(float((i / 2) + 1) / float(div));
5592 if (0 == i)
5593 {
5594 const float x = cast(float(i / 2) / float(div));
5595 vertices[v++] = {x, +1.0f, 0.0f, 1.0f};
5596 vertices[v++] = {x, -1.0f, 0.0f, 1.0f};
5597 vertices[v++] = {xx, +1.0f, 0.0f, 1.0f};
5598 }
5599 else
5600 {
5601 if (i % 2)
5602 vertices[v++] = {xx, -1.0f, 0.0f, 1.0f};
5603 else
5604 vertices[v++] = {xx, +1.0f, 0.0f, 1.0f};
5605 }
5606 }
5607 DE_ASSERT(vertexCount == v);
5608 }
5609 else if (VK_PRIMITIVE_TOPOLOGY_POINT_LIST == topology)
5610 {
5611 uint32_t v = 0;
5612 const auto rect = bestRect(vertexCount);
5613
5614 float y = -1.0f;
5615 for (uint32_t h = 0; h < rect.second; ++h)
5616 {
5617 const float yy = cast(float(h + 1) / float(rect.second));
5618 float x = -1.0f;
5619 for (uint32_t w = 0; w < rect.first && v < vertexCount; ++w)
5620 {
5621 const float xx = cast(float(w + 1) / float(rect.first));
5622 vertices[v++] = {((xx + x) / 2.0f), ((yy + y) / 2.0f), 0.0f, 1.0f};
5623 x = xx;
5624 }
5625 y = yy;
5626 }
5627 DE_ASSERT(vertexCount == v);
5628 }
5629 else if (VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST == topology || VK_PRIMITIVE_TOPOLOGY_PATCH_LIST == topology)
5630 {
5631 uint32_t v = 0;
5632 const auto rect = bestRect(triangleCount);
5633
5634 float y = -1.0f;
5635 for (uint32_t h = 0; h < rect.second && v < vertexCount; ++h)
5636 {
5637 const float yy = cast(float(h + 1) / float(rect.second));
5638 float x = -1.0f;
5639 for (uint32_t w = 0; w < rect.first && v < vertexCount; ++w)
5640 {
5641 const float xx = cast(float(w + 1) / float(rect.first));
5642 if (v < vertexCount)
5643 vertices[v++] = {x, yy, 0.f, 0.f};
5644 if (v < vertexCount)
5645 vertices[v++] = {((xx + x) / 2.f), y, 0.f, 0.f};
5646 if (v < vertexCount)
5647 vertices[v++] = {xx, ((yy + y) / 2.f), 0.f, 0.f};
5648 x = xx;
5649 }
5650 y = yy;
5651 }
5652 DE_ASSERT(vertexCount == v);
5653 }
5654
5655 return vertices;
5656 }
5657
5658 de::MovePtr<BufferWithMemory> ReconvergenceTestGraphicsInstance::createVertexBufferAndFlush(
5659 const std::vector<tcu::Vec4> &vertices)
5660 {
5661 const DeviceInterface &vk = m_context.getDeviceInterface();
5662 const VkDevice device = m_context.getDevice();
5663 Allocator &allocator = m_context.getDefaultAllocator();
5664 const VkDeviceSize bufferSize = VkDeviceSize(vertices.size()) * sizeof(tcu::Vec4);
5665 const VkBufferUsageFlags bufferUsage = VK_BUFFER_USAGE_VERTEX_BUFFER_BIT;
5666 const VkBufferCreateInfo createInfo = makeBufferCreateInfo(bufferSize, bufferUsage);
5667 const MemoryRequirement memoryReqs = (MemoryRequirement::HostVisible | MemoryRequirement::Coherent);
5668 de::MovePtr<BufferWithMemory> buffer(new BufferWithMemory(vk, device, allocator, createInfo, memoryReqs));
5669 Allocation &allocation = buffer->getAllocation();
5670 auto bufferRange = makeStdBeginEnd<tcu::Vec4>(allocation.getHostPtr(), (uint32_t)vertices.size());
5671 std::copy(vertices.begin(), vertices.end(), bufferRange.first);
5672 flushAlloc(vk, device, allocation);
5673 return buffer;
5674 }
5675
5676 void ReconvergenceTestGraphicsInstance::recordDrawingAndSubmit(
5677 const VkCommandBuffer cmdBuffer, const VkPipelineLayout pipelineLayout, const VkPipeline pipeline,
5678 const VkDescriptorSet descriptorSet, const PushConstant &pushConstant, const VkRenderPassBeginInfo &renderPassInfo,
5679 const VkBuffer vertexBuffer, const uint32_t vertexCount, const VkImage image)
5680 {
5681 DE_UNREF(image);
5682 const DeviceInterface &vk = m_context.getDeviceInterface();
5683 const VkDevice device = m_context.getDevice();
5684 const VkQueue queue = m_context.getUniversalQueue();
5685 const VkPipelineBindPoint bindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS;
5686
5687 beginCommandBuffer(vk, cmdBuffer, 0u);
5688 vk.cmdBindDescriptorSets(cmdBuffer, bindPoint, pipelineLayout, 0u, 1u, &descriptorSet, 0u, nullptr);
5689 vk.cmdBindPipeline(cmdBuffer, bindPoint, pipeline);
5690 vk.cmdBindVertexBuffers(cmdBuffer, 0u, 1u, &static_cast<const VkBuffer &>(vertexBuffer),
5691 &static_cast<const VkDeviceSize &>(0u));
5692 vk.cmdPushConstants(cmdBuffer, pipelineLayout, m_data.shaderStage, 0, sizeof(PushConstant), &pushConstant);
5693 vk.cmdBeginRenderPass(cmdBuffer, &renderPassInfo, VK_SUBPASS_CONTENTS_INLINE);
5694 vk.cmdDraw(cmdBuffer, vertexCount, 1u, 0u, 0u);
5695 vk.cmdEndRenderPass(cmdBuffer);
5696 endCommandBuffer(vk, cmdBuffer);
5697
5698 submitCommandsAndWait(vk, device, queue, cmdBuffer);
5699 }
5700
5701 std::vector<Move<VkShaderModule>> ReconvergenceTestFragmentInstance::createShaders(void)
5702 {
5703 const DeviceInterface &vk = m_context.getDeviceInterface();
5704 const VkDevice device = m_context.getDevice();
5705
5706 Move<VkShaderModule> vertex = createShaderModule(vk, device, m_context.getBinaryCollection().get("vert"), 0);
5707 Move<VkShaderModule> fragment = createShaderModule(vk, device, m_context.getBinaryCollection().get("test"), 0);
5708
5709 // shader module order: { vert, frag, [tesc, tese, geom] }, if present
5710 std::vector<Move<VkShaderModule>> shaders;
5711 shaders.emplace_back(vertex);
5712 shaders.emplace_back(fragment);
5713
5714 return shaders;
5715 }
5716
5717 qpTestResult_e ReconvergenceTestGraphicsInstance::calculateAndLogResult(const uint64_t *result,
5718 const std::vector<uint64_t> &ref,
5719 uint32_t invocationStride,
5720 uint32_t subgroupSize, uint32_t shaderMaxLocs,
5721 uint32_t primitiveCount, PrintMode printMode)
5722 {
5723 DE_ASSERT(m_data.testType == TT_MAXIMAL);
5724
5725 const uint32_t maxLoc = static_cast<uint32_t>(ref.size());
5726 tcu::TestLog &log = m_context.getTestContext().getLog();
5727 qpTestResult res = QP_TEST_RESULT_PASS;
5728 uint32_t mismatchCount = 0;
5729
5730 DE_ASSERT(shaderMaxLocs * invocationStride <= maxLoc);
5731
5732 // With maximal reconvergence, we should expect the output to exactly match
5733 // the reference.
5734 for (uint32_t i = 0; i < maxLoc; ++i)
5735 {
5736 const uint64_t resultVal = result[i];
5737 const uint64_t refVal = ref[i];
5738 if (resultVal != refVal)
5739 {
5740 if (1 > mismatchCount++)
5741 {
5742 log << tcu::TestLog::Message << mismatchCount << ": Mismatch at " << i
5743 << ", res: " << tcu::toHex(resultVal) << ", ref: " << tcu::toHex(refVal)
5744 << tcu::TestLog::EndMessage;
5745 }
5746 }
5747 }
5748
5749 if (PrintMode::None != printMode)
5750 {
5751 log << tcu::TestLog::Message << "deviceSubgroupSize: " << m_subgroupSize
5752 << ", testSubgroupSize: " << subgroupSize << ", invocationStride: " << invocationStride
5753 << ", shaderMaxLocs: " << shaderMaxLocs << "\n\t, framebuffer: " << m_data.sizeX << 'x' << m_data.sizeY
5754 << ", primitiveCount: " << primitiveCount << ", PRINT_MODE: "
5755 << ((PrintMode::ThreadsInColumns == printMode) ?
5756 "\"ouLocs in rows & threads in columns\"" :
5757 ((PrintMode::OutLocsInColumns == printMode) ? "\"threads in rows & outLocs in columns\"" : ""))
5758 << " { id:res,ref }\n"
5759 << tcu::TestLog::EndMessage;
5760 }
5761
5762 uint32_t invMax = std::min(invocationStride, 80u);
5763
5764 if (PrintMode::ThreadsInColumns == printMode)
5765 {
5766 for (uint32_t loc = 0; loc < shaderMaxLocs; ++loc)
5767 {
5768 auto l1 = log << tcu::TestLog::Message;
5769 l1 << "loc " << std::setw(3) << loc << ": ";
5770 for (uint32_t inv = 0; inv < invMax; ++inv)
5771 {
5772 uint32_t idx = loc * invocationStride + inv;
5773 DE_ASSERT(idx < maxLoc);
5774 uint64_t resEntry = result[idx];
5775 uint64_t refEntry = ref[idx];
5776 //l1 << de::toString(inv) << ':' << tcu::toHex(resEntry) << ',' << tcu::toHex(refEntry) << ' ';
5777 l1 << std::dec << inv << ':' << std::setw(subgroupSize / 4) << std::hex << resEntry << ','
5778 << std::setw(subgroupSize / 4) << std::hex << refEntry << std::dec << ' ';
5779 }
5780 l1 << std::setw(0) << tcu::TestLog::EndMessage;
5781 }
5782 }
5783 else if (PrintMode::OutLocsInColumns == printMode)
5784 {
5785 for (uint32_t inv = 0; inv < invMax; ++inv)
5786 {
5787 auto l1 = log << tcu::TestLog::Message;
5788 l1 << "res " << std::setw(3) << inv << ": ";
5789 for (uint32_t loc = 0; loc < shaderMaxLocs; ++loc)
5790 {
5791 uint32_t idx = loc * invocationStride + inv;
5792 DE_ASSERT(idx < maxLoc);
5793 uint64_t entry = result[idx];
5794 l1 << de::toString(loc) << ':' << tcu::toHex(entry) << ' ';
5795 }
5796 l1 << std::setw(0) << tcu::TestLog::EndMessage;
5797
5798 auto l2 = log << tcu::TestLog::Message;
5799 l2 << "ref " << std::setw(3) << inv << ": ";
5800 for (uint32_t loc = 0; loc < shaderMaxLocs; ++loc)
5801 {
5802 uint32_t idx = loc * invocationStride + inv;
5803 DE_ASSERT(idx < maxLoc);
5804 uint64_t entry = ref[idx];
5805 l2 << de::toString(loc) << ':' << tcu::toHex(entry) << ' ';
5806 }
5807 l2 << std::setw(0) << tcu::TestLog::EndMessage;
5808 }
5809 }
5810
5811 if (mismatchCount)
5812 {
5813 double mismatchPercentage = 0.0;
5814 std::modf((double)(mismatchCount * 100) / (double)maxLoc, &mismatchPercentage);
5815 log << tcu::TestLog::Message << "Mismatch count " << mismatchCount << " from " << maxLoc << " ("
5816 << mismatchPercentage << "%)" << tcu::TestLog::EndMessage;
5817 res = QP_TEST_RESULT_FAIL;
5818 }
5819
5820 if (res != QP_TEST_RESULT_PASS)
5821 {
5822 for (uint32_t i = 0; i < maxLoc; ++i)
5823 {
5824 // This log can be large and slow, ifdef it out by default
5825 #if 0
5826 log << tcu::TestLog::Message << "result " << i << "(" << (i / invocationStride) << ", " << (i % invocationStride) << "): " << tcu::toHex(result[i]) << " ref " << tcu::toHex(ref[i]) << (result[i] != ref[i] ? " different" : "") << tcu::TestLog::EndMessage;
5827 #endif
5828 }
5829 }
5830
5831 return res;
5832 }
5833
5834 qpTestResult_e ReconvergenceTestFragmentInstance::calculateAndLogResultEx(tcu::TestLog &log, const tcu::UVec4 *result,
5835 const std::vector<tcu::UVec4> &ref,
5836 const uint32_t maxLoc, const Arrangement &a,
5837 const PrintMode printMode)
5838 {
5839 DE_UNREF(printMode);
5840
5841 qpTestResult res = QP_TEST_RESULT_PASS;
5842 uint32_t mismatchCount = 0u;
5843 const uint32_t printMismatchCount = 5u;
5844 const FragmentRandomProgram::Arrangement &aa = static_cast<const FragmentRandomProgram::Arrangement &>(a);
5845
5846 // With maximal reconvergence, we should expect the output to exactly match
5847 // the reference.
5848 const uint32_t ballotStoreCount = maxLoc * aa.m_invocationStride * aa.m_primitiveStride;
5849 for (uint32_t i = 0; i < ballotStoreCount; ++i)
5850 {
5851 const Ballot resultVal(result[i], aa.m_subgroupSize);
5853 const Ballot refVal(ref[i], aa.m_subgroupSize);
5854 if (resultVal != refVal)
5855 {
5856 if (mismatchCount++ < printMismatchCount)
5857 {
5858 res = QP_TEST_RESULT_FAIL;
5859 log << tcu::TestLog::Message << "Mismatch at " << i << "\nexpected: " << refVal
5860 << "\n got: " << resultVal << tcu::TestLog::EndMessage;
5861 if (printMode == PrintMode::Console)
5862 {
5863 std::cout << "Mismatch at " << i << "\nexpected: " << resultVal << "\n got: " << refVal
5864 << std::endl;
5865 }
5866 }
5867 }
5868 }
5869
5870 log << tcu::TestLog::Message << "Mismatch count: " << mismatchCount << " from " << ballotStoreCount
5871 << tcu::TestLog::EndMessage;
5872 if (printMode == PrintMode::Console)
5873 {
5874 std::cout << "Mismatch count: " << mismatchCount << " from " << ballotStoreCount << std::endl;
5875 }
5876
5877 return res;
5878 }
5879
5880 VkImageCreateInfo ReconvergenceTestFragmentInstance::makeImageCreateInfo(VkFormat format) const
5881 {
5882 return {
5883 VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, // VkStructureType sType;
5884 nullptr, // const void* pNext;
5885 VkImageCreateFlags(0), // VkImageCreateFlags flags;
5886 VK_IMAGE_TYPE_2D, // VkImageType imageType;
5887 format, // VkFormat format;
5888 {m_data.sizeX, m_data.sizeY, 1u}, // VkExtent3D extent;
5889 1u, // uint32_t mipLevels;
5890 1u, // uint32_t arrayLayers;
5891 VK_SAMPLE_COUNT_1_BIT, // VkSampleCountFlagBits samples;
5892 VK_IMAGE_TILING_OPTIMAL, // VkImageTiling tiling;
5893 VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT, // VkImageUsageFlags usage;
5894 VK_SHARING_MODE_EXCLUSIVE, // VkSharingMode sharingMode;
5895 0u, // uint32_t queueFamilyIndexCount;
5896 0u, // const uint32_t* pQueueFamilyIndices;
5897 VK_IMAGE_LAYOUT_UNDEFINED // VkImageLayout initialLayout;
5898 };
5899 }
5900
5901 de::MovePtr<BufferWithMemory> ReconvergenceTestFragmentInstance::createVertexBufferAndFlush(
5902 uint32_t cellsHorz, uint32_t cellsVert, VkPrimitiveTopology topology)
5903 {
5904 // DE_ASSERT(cellsHorz == 2u);
5905 // DE_ASSERT((cellsHorz * 3) == cellsVert);
5906 DE_UNREF(cellsHorz);
5907 DE_UNREF(cellsVert);
5908 DE_UNREF(topology);
5909 DE_ASSERT(topology == VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST);
5910 const std::vector<tcu::Vec4> vertices{{-1.0f, 0.0f, 0.0f, 0.0f}, {-0.5f, -1.0f, 0.0f, 0.0f},
5911 {+1.0f, +1.0f, 0.0f, 0.0f}, {+0.5f, -1.0f, 0.0f, 0.0f},
5912 {+1.0f, 0.0f, 0.0f, 0.0f}, {-1.0f, +1.0f, 0.0f, 0.0f}};
5913 return ReconvergenceTestGraphicsInstance::createVertexBufferAndFlush(vertices);
5914 }
5915
5916 std::vector<uint32_t> ReconvergenceTestFragmentInstance::callAuxiliaryShader(tcu::TestStatus &status,
5917 uint32_t triangleCount)
5918 {
5919 const DeviceInterface &vk = m_context.getDeviceInterface();
5920 const VkDevice device = m_context.getDevice();
5921 add_ref<Allocator> allocator = m_context.getDefaultAllocator();
5922 const uint32_t queueIndex = m_context.getUniversalQueueFamilyIndex();
5923 //add_ref<tcu::TestLog> log = m_context.getTestContext().getLog();
5924 const uint32_t bufferElems = m_data.sizeX * m_data.sizeY * triangleCount + 3u;
5925 const VkDeviceSize bufferSize = bufferElems * sizeof(uint32_t);
5926
5927 if (bufferSize > m_context.getDeviceProperties().limits.maxStorageBufferRange)
5928 TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
5929
5930 const VkBufferCreateInfo createInfo =
5931 vk::makeBufferCreateInfo(bufferSize, (VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
5932 VK_BUFFER_USAGE_TRANSFER_SRC_BIT));
5933 de::MovePtr<BufferWithMemory> buffer;
5934 try
5935 {
5936 buffer = de::MovePtr<BufferWithMemory>(new BufferWithMemory(
5937 vk, device, allocator, createInfo, (MemoryRequirement::HostVisible | MemoryRequirement::Coherent)));
5938 }
5939 catch (tcu::ResourceError &)
5940 {
5941 // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
5942 status = tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
5943 "Failed device memory allocation " + de::toString(bufferSize) + " bytes");
5944 return {};
5945 }
5946
5947 const VkDescriptorBufferInfo bufferInfo = makeDescriptorBufferInfo(**buffer, 0, bufferSize);
5948
5949 vk::DescriptorSetLayoutBuilder layoutBuilder;
5950 layoutBuilder.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_FRAGMENT_BIT);
5951 vk::Unique<vk::VkDescriptorSetLayout> descriptorSetLayout(layoutBuilder.build(vk, device));
5952
5953 vk::DescriptorPoolBuilder poolBuilder;
5954 poolBuilder.addType(vk::VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1u);
5955 vk::Unique<vk::VkDescriptorPool> descriptorPool(
5956 poolBuilder.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
5957
5958 vk::Unique<vk::VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
5959
5960 vk::DescriptorSetUpdateBuilder setUpdateBuilder;
5961 setUpdateBuilder.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(0),
5962 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferInfo);
5963 setUpdateBuilder.update(vk, device);
5964
5965 const VkPushConstantRange pushConstantRange{
5966 VK_SHADER_STAGE_FRAGMENT_BIT, // VkShaderStageFlags stageFlags;
5967 0u, // uint32_t offset;
5968 sizeof(PushConstant) // uint32_t size;
5969 };
5970 const VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo{
5971 VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, // sType
5972 nullptr, // pNext
5973 (VkPipelineLayoutCreateFlags)0, // flags
5974 1u, // setLayoutCount
5975 &descriptorSetLayout.get(), // pSetLayouts
5976 1u, // pushConstantRangeCount
5977 &pushConstantRange, // pPushConstantRanges
5978 };
5979
5980 const VkFormat format = VK_FORMAT_R8G8B8A8_UNORM;
5981 const VkImageCreateInfo imageCreateInfo = makeImageCreateInfo(format);
5982 const VkImageSubresourceRange rscRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
5983 de::MovePtr<ImageWithMemory> image(
5984 new ImageWithMemory(vk, device, allocator, imageCreateInfo, vk::MemoryRequirement::Any));
5985 Move<VkImageView> view = makeImageView(vk, device, **image, VK_IMAGE_VIEW_TYPE_2D, format, rscRange);
5986 Move<VkRenderPass> renderPass = makeRenderPass(vk, device, format);
5987 Move<VkFramebuffer> framebuffer =
5988 makeFramebuffer(vk, device, *renderPass, *view, m_data.sizeX, m_data.sizeY, rscRange.layerCount);
5989 const VkRenderPassBeginInfo renderBeginInfo = makeRenderPassBeginInfo(*renderPass, *framebuffer);
5990 auto createAuxShaders = [&]()
5991 {
5992 Shaders shaders;
5993 auto vert = createShaderModule(vk, device, m_context.getBinaryCollection().get("vert"), 0);
5994 auto frag = createShaderModule(vk, device, m_context.getBinaryCollection().get("aux"), 0);
5995 shaders.emplace_back(vert);
5996 shaders.emplace_back(frag);
5997 return shaders;
5998 };
5999 const Shaders shaders = createAuxShaders();
6000 const uint32_t vertexCount = triangleCount * 3u;
6001 de::MovePtr<BufferWithMemory> vertexBuffer =
6002 createVertexBufferAndFlush(triangleCount, vertexCount, VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST);
6003 Move<VkPipelineLayout> pipelineLayout = createPipelineLayout(vk, device, &pipelineLayoutCreateInfo, NULL);
6004 Move<VkPipeline> pipeline = createGraphicsPipeline(*pipelineLayout, *renderPass, m_data.sizeX, m_data.sizeY,
6005 shaders, VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST, 0U);
6006 Move<VkCommandPool> cmdPool =
6007 createCommandPool(vk, device, vk::VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, queueIndex);
6008 Move<VkCommandBuffer> cmdBuffer = allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY);
6009
6010 PushConstant pc{};
6011 pc.invocationStride = 0u;
6012 pc.width = m_data.sizeX;
6013 pc.height = m_data.sizeY;
6014 pc.primitiveStride = triangleCount;
6015
6016 void *ptr = buffer->getAllocation().getHostPtr();
6017 auto bufferRange = makeStdBeginEnd<uint32_t>(ptr, bufferElems);
6018 std::fill(bufferRange.first, bufferRange.second, 0u);
6019
6020 std::bind(&ReconvergenceTestGraphicsInstance::recordDrawingAndSubmit, this, *cmdBuffer, *pipelineLayout, *pipeline,
6021 *descriptorSet, std::cref(pc), std::cref(renderBeginInfo), **vertexBuffer, vertexCount, **image)();
6022
6023 status = tcu::TestStatus::pass(std::string());
6024 return std::vector<uint32_t>(bufferRange.first, bufferRange.second);
6025 }
6026
6027 tcu::TestStatus ReconvergenceTestFragmentInstance::iterate(void)
6028 {
6029 const DeviceInterface &vk = m_context.getDeviceInterface();
6030 const VkDevice device = m_context.getDevice();
6031 add_ref<Allocator> allocator = m_context.getDefaultAllocator();
6032 const uint32_t queueIndex = m_context.getUniversalQueueFamilyIndex();
6033 add_ref<tcu::TestLog> log = m_context.getTestContext().getLog();
6034 const VkPhysicalDeviceLimits &limits = m_context.getDeviceProperties().limits;
6035 const uint32_t fragmentStride = m_data.sizeX * m_data.sizeY;
6036 const uint32_t primitiveStride = 2;
6037
6038 if (sizeof(PushConstant) > limits.maxPushConstantsSize)
6039 {
6040 return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
6041 "PushConstant size " + std::to_string(sizeof(PushConstant)) + " exceeds device limit " +
6042 std::to_string(limits.maxPushConstantsSize));
6043 }
6044
6045 tcu::TestStatus auxStatus(QP_TEST_RESULT_FAIL, std::string());
6046 std::vector<uint32_t> primitiveMap = callAuxiliaryShader(auxStatus, primitiveStride);
6047 if (auxStatus.isFail())
6048 return auxStatus;
6049
6050 const uint32_t shaderSubgroupSize = primitiveMap.at(fragmentStride * primitiveStride + 1u);
6051 if (shaderSubgroupSize != m_subgroupSize)
6052 {
6053 return tcu::TestStatus(QP_TEST_RESULT_FAIL,
6054 "The size of the subgroup from the shader (" + std::to_string(shaderSubgroupSize) +
6055 ") is different from the size of the subgroup from the device (" +
6056 std::to_string(m_subgroupSize) + ")");
6057 }
6058 const uint32_t shaderSubgroupStride = primitiveMap.at(fragmentStride * primitiveStride + 0u);
6059 const uint32_t hostSubgroupStride =
6060 FragmentRandomProgram::Arrangement::calcSubgroupCount(primitiveMap, primitiveStride, fragmentStride);
6061 if (shaderSubgroupStride != hostSubgroupStride)
6062 {
6063 return tcu::TestStatus(QP_TEST_RESULT_FAIL,
6064 "The number of subgroups from the shader (" + std::to_string(shaderSubgroupStride) +
6065 ") is different from the number of subgroups calculated manually (" +
6066 std::to_string(hostSubgroupStride) + ")");
6067 }
6068
6069 log << tcu::TestLog::Message << "Subgroup count: " << hostSubgroupStride << tcu::TestLog::EndMessage;
6070 log << tcu::TestLog::Message << "Subgroup size: " << m_subgroupSize << tcu::TestLog::EndMessage;
6071
6072 const VkPrimitiveTopology topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST;
6073 de::MovePtr<BufferWithMemory> vertexBuffer =
6074 createVertexBufferAndFlush(primitiveStride, (primitiveStride * 3u), topology);
6075
6076 std::vector<tcu::UVec4> ref;
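// FragmentRandomProgram drives both sides of the comparison: execute(..., true, ...) below simulates
// the generated program on the CPU to build the reference ballots in 'ref', and a second execute()
// call after the draw replays it against the ballots the GPU actually wrote.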
6077 de::MovePtr<FragmentRandomProgram> program = FragmentRandomProgram::create(m_data);
6078 program->generateRandomProgram(m_context.getTestContext().getWatchDog(), log);
6079
6080 const uint32_t simulationMaxLoc = program->execute(m_context.getTestContext().getWatchDog(), true, m_subgroupSize,
6081 fragmentStride, primitiveStride, ref, log, primitiveMap);
6082 log << tcu::TestLog::Message << "simulated maxLoc: " << simulationMaxLoc << tcu::TestLog::EndMessage;
6083 // maxLoc is per-invocation. Add one (to make sure no additional writes are done)
6084 uint32_t maxLoc = simulationMaxLoc;
6085 maxLoc += 1;
6086 maxLoc *= (hostSubgroupStride * 128u * primitiveStride);
6087
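// Two-pass scheme: the first draw only counts how many ballot locations each invocation would write
// (OutputCounts); OutputBallots is then re-sized if the shader needed more room than the simulation
// predicted, and a second draw records the actual ballots.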
6088 constexpr uint32_t bufferCount = 4;
6089 enum Bindings
6090 {
6091 InputA,
6092 OutputBallots,
6093 OutputCounts,
6094 OutputPriMap
6095 };
6096
6097 de::MovePtr<BufferWithMemory> buffers[bufferCount];
6098 vk::VkDescriptorBufferInfo bufferDescriptors[bufferCount];
6099
6100 VkDeviceSize sizes[bufferCount]{
6101 // InputA { uint a[]; } inputA; filled with a[i] := i
6102 (FragmentRandomProgram::conditionIfInvocationStride + 2) * sizeof(uint32_t),
6103
6104 // OutputB { uvec4 b[]; } outputB;
6105 maxLoc * sizeof(tcu::UVec4),
6106
6107 // OutputC { uint loc[]; } outputC;
6108 (hostSubgroupStride * 128u * primitiveStride) * sizeof(uint32_t),
6109
6110 // OutputP { uint p[]; } outputP; a few extra entries hold subgroupID, subgroupSize, and the non-helper and helper invocation counts
6111 (fragmentStride * primitiveStride + 16u) * sizeof(uint32_t)};
6112
6113 VkBufferUsageFlags usages[bufferCount]{
6114 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
6115 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
6116 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
6117 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
6118 };
6119
6120 // allocate buffers
6121 for (uint32_t i = 0; i < bufferCount; ++i)
6122 {
6123 if (sizes[i] > limits.maxStorageBufferRange)
6124 TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
6125
6126 try
6127 {
6128 buffers[i] = de::MovePtr<BufferWithMemory>(
6129 new BufferWithMemory(vk, device, allocator,
6130 makeBufferCreateInfo(sizes[i], usages[i] | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
6131 VK_BUFFER_USAGE_TRANSFER_SRC_BIT),
6132 MemoryRequirement::HostVisible | MemoryRequirement::Cached));
6133 }
6134 catch (tcu::ResourceError &)
6135 {
6136 // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
6137 return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
6138 "Failed device memory allocation " + de::toString(sizes[i]) + " bytes");
6139 }
6140 bufferDescriptors[i] = makeDescriptorBufferInfo(**buffers[i], 0, sizes[i]);
6141 }
6142
6143 // get raw pointers to previously allocated buffers
6144 void *ptrs[bufferCount];
6145 for (uint32_t i = 0; i < bufferCount; ++i)
6146 {
6147 ptrs[i] = buffers[i]->getAllocation().getHostPtr();
6148 }
6149
6150 // populate buffers with their initial content
6151 {
6152 auto rangeBufferA =
6153 makeStdBeginEnd<uint32_t>(ptrs[InputA], static_cast<uint32_t>(sizes[InputA] / sizeof(uint32_t)));
6154 std::iota(rangeBufferA.first, rangeBufferA.second, 0u);
6155 }
6156 deMemset(ptrs[OutputBallots], 0, (size_t)sizes[OutputBallots]);
6157 deMemset(ptrs[OutputCounts], 0, (size_t)sizes[OutputCounts]);
6158 deMemset(ptrs[OutputPriMap], 0, (size_t)sizes[OutputPriMap]);
6159
6160 // (...) and flush them to the GPU
6161 for (uint32_t i = 0; i < bufferCount; ++i)
6162 {
6163 flushAlloc(vk, device, buffers[i]->getAllocation());
6164 }
6165
6166 VkDescriptorType descTypes[bufferCount]{
6167 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
6168 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
6169 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
6170 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
6171 };
6172
6173 vk::DescriptorSetLayoutBuilder layoutBuilder;
6174 for (uint32_t i = 0; i < bufferCount; ++i)
6175 {
6176 layoutBuilder.addSingleBinding(descTypes[i], m_data.shaderStage);
6177 }
6178 vk::Unique<vk::VkDescriptorSetLayout> descriptorSetLayout(layoutBuilder.build(vk, device));
6179
6180 vk::DescriptorPoolBuilder poolBuilder;
6181 for (uint32_t i = 0; i < bufferCount; ++i)
6182 {
6183 poolBuilder.addType(descTypes[i], 1);
6184 }
6185 vk::Unique<vk::VkDescriptorPool> descriptorPool(
6186 poolBuilder.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
6187 vk::Unique<vk::VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
6188
6189 vk::DescriptorSetUpdateBuilder setUpdateBuilder;
6190 for (uint32_t i = 0; i < bufferCount; ++i)
6191 {
6192 setUpdateBuilder.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(i), descTypes[i],
6193 &bufferDescriptors[i]);
6194 }
6195 setUpdateBuilder.update(vk, device);
6196
6197 const VkPushConstantRange pushConstantRange{
6198 (VkShaderStageFlags)m_data.shaderStage, // VkShaderStageFlags stageFlags;
6199 0u, // uint32_t offset;
6200 sizeof(PushConstant) // uint32_t size;
6201 };
6202
6203 const VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo{
6204 VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, // sType
6205 nullptr, // pNext
6206 (VkPipelineLayoutCreateFlags)0, // flags
6207 1u, // setLayoutCount
6208 &descriptorSetLayout.get(), // pSetLayouts
6209 1u, // pushConstantRangeCount
6210 &pushConstantRange, // pPushConstantRanges
6211 };
6212
6213 const VkFormat format = VK_FORMAT_R8G8B8A8_UNORM;
6214 const VkImageCreateInfo imageCreateInfo = makeImageCreateInfo(format);
6215 const VkImageSubresourceRange rscRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
6216 de::MovePtr<ImageWithMemory> image(
6217 new ImageWithMemory(vk, device, allocator, imageCreateInfo, vk::MemoryRequirement::Any));
6218 Move<VkImageView> view = makeImageView(vk, device, **image, VK_IMAGE_VIEW_TYPE_2D, format, rscRange);
6219 Move<VkRenderPass> renderPass = makeRenderPass(vk, device, format);
6220 Move<VkFramebuffer> framebuffer =
6221 makeFramebuffer(vk, device, *renderPass, *view, m_data.sizeX, m_data.sizeY, rscRange.layerCount);
6222 const VkRenderPassBeginInfo renderBeginInfo = makeRenderPassBeginInfo(*renderPass, *framebuffer);
6223 const Shaders shaders = createShaders();
6224 Move<VkPipelineLayout> pipelineLayout = createPipelineLayout(vk, device, &pipelineLayoutCreateInfo, NULL);
6225 Move<VkPipeline> pipeline =
6226 createGraphicsPipeline(*pipelineLayout, *renderPass, m_data.sizeX, m_data.sizeY, shaders, topology, 0U);
6227 Move<VkCommandPool> cmdPool =
6228 createCommandPool(vk, device, vk::VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, queueIndex);
6229 Move<VkCommandBuffer> cmdBuffer = allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY);
6230
6231 PushConstant pc{};
6232 pc.width = m_data.sizeX;
6233 pc.height = m_data.sizeY;
6234 pc.primitiveStride = primitiveStride;
6235 pc.invocationStride = 0u;
6236 pc.subgroupStride = hostSubgroupStride;
6237 pc.enableInvocationIndex = VK_FALSE;
6238
6239 auto callRecordDrawingAndSubmit = std::bind(
6240 &ReconvergenceTestGraphicsInstance::recordDrawingAndSubmit, this, *cmdBuffer, *pipelineLayout, *pipeline,
6241 *descriptorSet, std::cref(pc), std::cref(renderBeginInfo), **vertexBuffer, (primitiveStride * 3u), **image);
6242
6243 // compute "maxLoc", which is a potential maximum number of locations written
6244 callRecordDrawingAndSubmit();
6245
6246 // Take the maximum of "maxLoc" over all invocations.
6247 invalidateAlloc(vk, device, buffers[OutputCounts]->getAllocation());
6248 auto rangeLoc = makeStdBeginEnd<const uint32_t>(ptrs[OutputCounts], (hostSubgroupStride * 128u * primitiveStride));
6249 const uint32_t computedShaderMaxLoc = *max_element(rangeLoc.first, rangeLoc.second);
6250 log << tcu::TestLog::Message << "Computed maxLoc in the shader: " << computedShaderMaxLoc
6251 << tcu::TestLog::EndMessage;
6252
6253 if (computedShaderMaxLoc >= FragmentRandomProgram::experimentalOutLocSize)
6254 {
6255 return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
6256 "Calculated maxLoc from a shader (which is " + de::toString(computedShaderMaxLoc) +
6257 ") "
6258 "exceeds BALLOT_STACK_SIZE (which is " +
6259 de::toString(FragmentRandomProgram::experimentalOutLocSize) +
6260 ").\n"
6261 "To repair this just increment slightly a " MAKETEXT(
6262 FragmentRandomProgram::experimentalOutLocSize) " "
6263 "in line " +
6264 de::toString(BALLOT_STACK_SIZE_DEFVAL_LINE));
6265 }
6266
6267 // If we need more space, reallocate OutputB::b[]
6268 if (computedShaderMaxLoc != simulationMaxLoc)
6269 {
6270 // Add one (to make sure no additional writes are done) and multiply by
6271 // the number of invocations and current primitive count
6272 maxLoc = (std::max(computedShaderMaxLoc, simulationMaxLoc) + 1) * (hostSubgroupStride * 128u * primitiveStride);
6273 sizes[OutputBallots] = maxLoc * sizeof(tcu::UVec4);
6274
6275 if (sizes[OutputBallots] > limits.maxStorageBufferRange)
6276 TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
6277
6278 try
6279 {
6280 buffers[OutputBallots] = de::MovePtr<BufferWithMemory>(new BufferWithMemory(
6281 vk, device, allocator,
6282 makeBufferCreateInfo(sizes[OutputBallots], usages[OutputBallots] | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
6283 VK_BUFFER_USAGE_TRANSFER_SRC_BIT),
6284 MemoryRequirement::HostVisible | MemoryRequirement::Cached));
6285 }
6286 catch (tcu::ResourceError &)
6287 {
6288 // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
6289 return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
6290 "Failed device memory allocation " + de::toString(sizes[OutputBallots]) + " bytes");
6291 }
6292 bufferDescriptors[OutputBallots] = makeDescriptorBufferInfo(**buffers[OutputBallots], 0, sizes[OutputBallots]);
6293 ptrs[OutputBallots] = buffers[OutputBallots]->getAllocation().getHostPtr();
6294
6295 vk::DescriptorSetUpdateBuilder setUpdateBuilder2;
6296 setUpdateBuilder2.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(OutputBallots),
6297 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptors[OutputBallots]);
6298 setUpdateBuilder2.update(vk, device);
6299 }
6300
6301 // Clear any writes to ballots/stores OutputB::b[] aka buffer[OutputBallots] during the counting pass
6302 // Note that its size may have changed since the first memory allocation
6303 deMemset(ptrs[OutputBallots], 0, (size_t)sizes[OutputBallots]);
6304 // Clear any writes to counting OutputC::loc[] aka buffer[OutputCounts] during the counting pass
6305 deMemset(ptrs[OutputCounts], 0, (size_t)sizes[OutputCounts]);
6306 // Clear any writes to counting OutputP::p[] aka buffer[OutputPriMap] during the counting pass
6307 deMemset(ptrs[OutputPriMap], 0, (size_t)sizes[OutputPriMap]);
6308
6309 // flush them all to the GPU
6310 flushAlloc(vk, device, buffers[OutputBallots]->getAllocation());
6311 flushAlloc(vk, device, buffers[OutputCounts]->getAllocation());
6312 flushAlloc(vk, device, buffers[OutputPriMap]->getAllocation());
6313
6314 // run the actual shader with updated PushConstant
6315 pc.enableInvocationIndex = VK_TRUE;
6316 callRecordDrawingAndSubmit();
6317
6318 invalidateAlloc(vk, device, buffers[OutputCounts]->getAllocation());
6319 invalidateAlloc(vk, device, buffers[OutputBallots]->getAllocation());
6320 invalidateAlloc(vk, device, buffers[OutputPriMap]->getAllocation());
6321
6322 // Simulate execution on the CPU, and compare against the GPU result
6323 try
6324 {
6325 ref.resize(maxLoc, tcu::UVec4());
6326 }
6327 catch (const std::bad_alloc &)
6328 {
6329 // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
6330 return tcu::TestStatus(QP_TEST_RESULT_NOT_SUPPORTED,
6331 "Failed system memory allocation " + de::toString(maxLoc * sizeof(uint64_t)) + " bytes");
6332 }
6333
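// Refresh primitiveMap with the fragment-to-primitive assignment the GPU produced in this run, so the
// CPU replay below groups invocations exactly as the hardware did.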
6334 std::fill(primitiveMap.begin(), primitiveMap.end(), 0);
6335 auto primitiveMapRange = makeStdBeginEnd<const uint32_t>(ptrs[OutputPriMap], (fragmentStride * primitiveStride));
6336 std::copy(primitiveMapRange.first, primitiveMapRange.second, primitiveMap.begin());
6337
6338 const FragmentRandomProgram::Arrangement a(primitiveMap, m_data.sizeX, m_data.sizeY, m_subgroupSize,
6339 primitiveStride);
6340 const tcu::UVec4 *ballots = static_cast<tcu::UVec4 *>(ptrs[OutputBallots]);
6341
6342 program->execute(m_context.getTestContext().getWatchDog(), false, m_subgroupSize, fragmentStride, primitiveStride,
6343 ref, log, primitiveMap, ballots);
6344
6345 const uint32_t finalMaxLoc = std::max(computedShaderMaxLoc, simulationMaxLoc);
6346 const qpTestResult res = calculateAndLogResultEx(log, ballots, ref, finalMaxLoc, a, PrintMode::None);
6347
6348 return tcu::TestStatus(res, qpGetTestResultName(res));
6349 }
6350
6351 de::MovePtr<BufferWithMemory> ReconvergenceTestVertexInstance::createVertexBufferAndFlush(uint32_t cellsHorz,
6352 uint32_t cellsVert,
6353 VkPrimitiveTopology topology)
6354 {
6355 DE_UNREF(topology);
6356 DE_ASSERT(VK_PRIMITIVE_TOPOLOGY_POINT_LIST == topology);
6357 const std::vector<tcu::Vec4> vertices =
6358 VertexRandomProgram::Arrangement::generatePrimitives(cellsHorz, cellsVert, VertexRandomProgram::fillPercentage);
6359 return ReconvergenceTestGraphicsInstance::createVertexBufferAndFlush(vertices);
6360 }
6361
6362 std::vector<Move<VkShaderModule>> ReconvergenceTestVertexInstance::createShaders(void)
6363 {
6364 const DeviceInterface &vk = m_context.getDeviceInterface();
6365 const VkDevice device = m_context.getDevice();
6366
6367 Move<VkShaderModule> vertex = createShaderModule(vk, device, m_context.getBinaryCollection().get("test"), 0);
6368 Move<VkShaderModule> fragment = createShaderModule(vk, device, m_context.getBinaryCollection().get("frag"), 0);
6369
6370 // { #vert, #frag, tesc, tese, geom }; if any
6371 std::vector<Move<VkShaderModule>> shaders;
6372 shaders.emplace_back(vertex);
6373 shaders.emplace_back(fragment);
6374
6375 return shaders;
6376 }
6377
6378 tcu::TestStatus ReconvergenceTestVertexInstance::iterate(void)
6379 {
6380 const VkPhysicalDeviceLimits &limits = m_context.getDeviceProperties().limits;
6381 if (sizeof(PushConstant) > limits.maxPushConstantsSize)
6382 {
6383 return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
6384 "PushConstant size " + std::to_string(sizeof(PushConstant)) + " exceeds device limit " +
6385 std::to_string(limits.maxPushConstantsSize));
6386 }
6387
6388 const DeviceInterface &vk = m_context.getDeviceInterface();
6389 const VkDevice device = m_context.getDevice();
6390 Allocator &allocator = m_context.getDefaultAllocator();
6391 const uint32_t queueIndex = m_context.getUniversalQueueFamilyIndex();
6392 add_ref<tcu::TestLog> log = m_context.getTestContext().getLog();
6393 const VkPrimitiveTopology topology = VK_PRIMITIVE_TOPOLOGY_POINT_LIST;
6394 const uint32_t fragmentStride = uint32_t(m_data.sizeX * m_data.sizeY);
6395 const uint32_t invocationStride =
6396 static_cast<uint32_t>(VertexRandomProgram::Arrangement::generatePrimitives(m_data.sizeX, m_data.sizeY,
6397 VertexRandomProgram::fillPercentage)
6398 .size());
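// invocationStride is the number of point primitives actually emitted (VertexRandomProgram::fillPercentage
// controls how many of the sizeX * sizeY cells receive a point), i.e. one vertex-shader invocation per vertex.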
6399
6400 de::MovePtr<VertexRandomProgram> program(new VertexRandomProgram(m_data));
6401 program->generateRandomProgram(m_context.getTestContext().getWatchDog(), log);
6402
6403 // simulate content of outputP buffer
6404 std::vector<uint32_t> outputP =
6405 VertexRandomProgram::Arrangement::generateOutputPvector(m_subgroupSize, invocationStride);
6406
6407 std::vector<tcu::UVec4> ref;
6408 const uint32_t hostMaxLoc = program->execute(m_context.getTestContext().getWatchDog(), true, m_subgroupSize,
6409 fragmentStride, invocationStride, ref, log, outputP, nullptr);
6410 log << tcu::TestLog::Message << "Rendering area : " << tcu::UVec2(m_data.sizeX, m_data.sizeY)
6411 << tcu::TestLog::EndMessage;
6412 log << tcu::TestLog::Message << "invocationStride: " << invocationStride << tcu::TestLog::EndMessage;
6413 log << tcu::TestLog::Message << "Simulated maxLoc: " << hostMaxLoc << tcu::TestLog::EndMessage;
6414 // maxLoc is per-invocation. Add one (to make sure no additional writes are done).
6415 uint32_t maxLoc = hostMaxLoc;
6416 maxLoc += 1;
6417 maxLoc *= invocationStride;
6418
6419 constexpr uint32_t bufferCount = 4u;
6420 enum Bindings
6421 {
6422 InputA,
6423 OutputBallots,
6424 OutputCounts,
6425 OutputPrimitives
6426 };
6427
6428 de::MovePtr<BufferWithMemory> buffers[bufferCount];
6429 vk::VkDescriptorBufferInfo bufferDescriptors[bufferCount];
6430
6431 uint32_t counts[bufferCount]{// InputA { uint a[]; } inputA;
6432 uint32_t(m_data.sizeX * m_data.sizeY),
6433 // OutputB { uvec4 b[]; } outputB;
6434 maxLoc,
6435 // OutputC { uint loc[]; } outputC;
6436 invocationStride,
6437 // OutputP { uint p[]; } outputP;
6438 uint32_t(outputP.size())};
6439
6440 VkDeviceSize sizes[bufferCount]{// InputA { uint a[]; } inputA;
6441 counts[InputA] * sizeof(uint32_t),
6442 // OutputB { uvec4 b[]; } outputB;
6443 counts[OutputBallots] * sizeof(tcu::UVec4),
6444 // OutputC { uint loc[]; } outputC;
6445 counts[OutputCounts] * sizeof(uint32_t),
6446 // OutputP { uint p[]; } outputP;
6447 counts[OutputPrimitives] * sizeof(uint32_t)};
6448
6449 const VkBufferUsageFlags cmnUsages = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
6450 VkBufferUsageFlags usages[bufferCount]{
6451 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
6452 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
6453 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
6454 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
6455 };
6456
6457 // allocate buffers
6458 for (uint32_t i = 0; i < bufferCount; ++i)
6459 {
6460 if (sizes[i] > limits.maxStorageBufferRange)
6461 TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
6462
6463 try
6464 {
6465 buffers[i] = de::MovePtr<BufferWithMemory>(
6466 new BufferWithMemory(vk, device, allocator, makeBufferCreateInfo(sizes[i], usages[i] | cmnUsages),
6467 MemoryRequirement::HostVisible | MemoryRequirement::Cached));
6468 }
6469 catch (tcu::ResourceError &)
6470 {
6471 // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
6472 return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
6473 "Failed device memory allocation " + de::toString(sizes[i]) + " bytes");
6474 }
6475 bufferDescriptors[i] = makeDescriptorBufferInfo(**buffers[i], 0, sizes[i]);
6476 }
6477
6478 // get raw pointers to previously allocated buffers
6479 void *ptrs[bufferCount];
6480 for (uint32_t i = 0; i < bufferCount; ++i)
6481 {
6482 ptrs[i] = buffers[i]->getAllocation().getHostPtr();
6483 }
6484
6485 // populate buffers with their initial content
6486 {
6487 auto rangeBufferA = makeStdBeginEnd<uint32_t>(ptrs[InputA], counts[InputA]);
6488 std::iota(rangeBufferA.first, rangeBufferA.second, 0u);
6489 }
6490 deMemset(ptrs[OutputBallots], 0, (size_t)sizes[OutputBallots]);
6491 deMemset(ptrs[OutputCounts], 0, (size_t)sizes[OutputCounts]);
6492 deMemset(ptrs[OutputPrimitives], 0, (size_t)sizes[OutputPrimitives]);
6493
6494 // (...) and flush them to the GPU
6495 for (uint32_t i = 0; i < bufferCount; ++i)
6496 {
6497 flushAlloc(vk, device, buffers[i]->getAllocation());
6498 }
6499
6500 VkDescriptorType descTypes[bufferCount]{VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
6501 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER};
6502
6503 vk::DescriptorSetLayoutBuilder layoutBuilder;
6504 for (uint32_t i = 0; i < bufferCount; ++i)
6505 {
6506 layoutBuilder.addSingleBinding(descTypes[i], m_data.shaderStage);
6507 }
6508 vk::Unique<vk::VkDescriptorSetLayout> descriptorSetLayout(layoutBuilder.build(vk, device));
6509
6510 vk::DescriptorPoolBuilder poolBuilder;
6511 for (uint32_t i = 0; i < bufferCount; ++i)
6512 {
6513 poolBuilder.addType(descTypes[i], 1);
6514 }
6515 vk::Unique<vk::VkDescriptorPool> descriptorPool(
6516 poolBuilder.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
6517 vk::Unique<vk::VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
6518
6519 vk::DescriptorSetUpdateBuilder setUpdateBuilder;
6520 for (uint32_t i = 0; i < bufferCount; ++i)
6521 {
6522 setUpdateBuilder.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(i), descTypes[i],
6523 &bufferDescriptors[i]);
6524 }
6525 setUpdateBuilder.update(vk, device);
6526
6527 const VkPushConstantRange pushConstantRange{
6528 (VkShaderStageFlags)m_data.shaderStage, // VkShaderStageFlags stageFlags;
6529 0u, // uint32_t offset;
6530 sizeof(PushConstant) // uint32_t size;
6531 };
6532
6533 const VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo{
6534 VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, // sType
6535 nullptr, // pNext
6536 (VkPipelineLayoutCreateFlags)0, // flags
6537 1u, // setLayoutCount
6538 &descriptorSetLayout.get(), // pSetLayouts
6539 1u, // pushConstantRangeCount
6540 &pushConstantRange, // pPushConstantRanges
6541 };
6542
6543 const uint32_t imageWidth = m_data.sizeX;
6544 const uint32_t imageHeight = m_data.sizeY;
6545 const VkFormat format = VK_FORMAT_R8G8B8A8_UNORM;
6546 const VkImageCreateInfo imageCreateInfo{
6547 VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, // VkStructureType sType;
6548 nullptr, // const void* pNext;
6549 VkImageCreateFlags(0), // VkImageCreateFlags flags;
6550 VK_IMAGE_TYPE_2D, // VkImageType imageType;
6551 format, // VkFormat format;
6552 {imageWidth, imageHeight, 1u}, // VkExtent3D extent;
6553 1u, // uint32_t mipLevels;
6554 1u, // uint32_t arrayLayers;
6555 VK_SAMPLE_COUNT_1_BIT, // VkSampleCountFlagBits samples;
6556 VK_IMAGE_TILING_OPTIMAL, // VkImageTiling tiling;
6557 VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT, // VkImageUsageFlags usage;
6558 VK_SHARING_MODE_EXCLUSIVE, // VkSharingMode sharingMode;
6559 0u, // uint32_t queueFamilyIndexCount;
6560 0u, // const uint32_t* pQueueFamilyIndices;
6561 VK_IMAGE_LAYOUT_UNDEFINED // VkImageLayout initialLayout;
6562 };
6563 const VkImageSubresourceRange rscRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
6564 de::MovePtr<ImageWithMemory> image(
6565 new ImageWithMemory(vk, device, allocator, imageCreateInfo, vk::MemoryRequirement::Any));
6566 Move<VkImageView> view = makeImageView(vk, device, **image, VK_IMAGE_VIEW_TYPE_2D, format, rscRange);
6567 Move<VkRenderPass> renderPass = makeRenderPass(vk, device, format);
6568 Move<VkFramebuffer> framebuffer =
6569 makeFramebuffer(vk, device, *renderPass, *view, m_data.sizeX, m_data.sizeY, rscRange.layerCount);
6570 de::MovePtr<BufferWithMemory> vertexBuffer = createVertexBufferAndFlush(m_data.sizeX, m_data.sizeY, topology);
6571 const VkRenderPassBeginInfo renderBeginInfo = makeRenderPassBeginInfo(*renderPass, *framebuffer);
6572 const Shaders shaders = createShaders();
6573 Move<VkPipelineLayout> pipelineLayout = createPipelineLayout(vk, device, &pipelineLayoutCreateInfo, NULL);
6574 Move<VkPipeline> pipeline =
6575 createGraphicsPipeline(*pipelineLayout, *renderPass, imageWidth, imageHeight, shaders, topology, 0u);
6576 Move<VkCommandPool> cmdPool =
6577 createCommandPool(vk, device, vk::VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, queueIndex);
6578 Move<VkCommandBuffer> cmdBuffer = allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY);
6579
6580 PushConstant pc{};
6581 pc.invocationStride = invocationStride;
6582 pc.width = m_data.sizeX;
6583 pc.height = m_data.sizeY;
6584 pc.enableInvocationIndex = VK_FALSE;
6585
6586 auto callRecordDrawingAndSubmit = std::bind(&ReconvergenceTestGraphicsInstance::recordDrawingAndSubmit, this,
6587 *cmdBuffer, *pipelineLayout, *pipeline, *descriptorSet, std::cref(pc),
6588 std::cref(renderBeginInfo), **vertexBuffer, invocationStride, **image);
6589
6590 // compute "maxLoc", which is a potential maximum number of locations written
6591 callRecordDrawingAndSubmit();
6592
6593 // Take the maximum of "maxLoc" over all invocations.
6594 invalidateAlloc(vk, device, buffers[OutputCounts]->getAllocation());
6595 auto rangeLoc = makeStdBeginEnd<const uint32_t>(ptrs[OutputCounts], counts[OutputCounts]);
6596 const uint32_t shaderMaxLoc = (*max_element(rangeLoc.first, rangeLoc.second));
6597 log << tcu::TestLog::Message << "Computed maxLoc in shader: " << shaderMaxLoc << tcu::TestLog::EndMessage;
6598
6599 // If we need more space, reallocate OutputB::b[] aka buffers[1]
6600 if (shaderMaxLoc != hostMaxLoc)
6601 {
6602 // Add one (to make sure no additional writes are done) and multiply by
6603 // the number of invocations and current primitive count
6604 maxLoc = (std::max(shaderMaxLoc, hostMaxLoc) + 1u) * invocationStride;
6605 counts[OutputBallots] = maxLoc;
6606 sizes[OutputBallots] = counts[OutputBallots] * sizeof(tcu::UVec4);
6607
6608 if (sizes[OutputBallots] > limits.maxStorageBufferRange)
6609 TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
6610
6611 try
6612 {
6613 buffers[OutputBallots] = de::MovePtr<BufferWithMemory>(new BufferWithMemory(
6614 vk, device, allocator, makeBufferCreateInfo(sizes[OutputBallots], usages[OutputBallots] | cmnUsages),
6615 MemoryRequirement::HostVisible | MemoryRequirement::Cached));
6616 }
6617 catch (tcu::ResourceError &)
6618 {
6619 // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
6620 return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
6621 "Failed device memory allocation " + de::toString(sizes[OutputBallots]) + " bytes");
6622 }
6623 bufferDescriptors[OutputBallots] = makeDescriptorBufferInfo(**buffers[OutputBallots], 0, sizes[OutputBallots]);
6624 ptrs[OutputBallots] = buffers[OutputBallots]->getAllocation().getHostPtr();
6625
6626 vk::DescriptorSetUpdateBuilder setUpdateBuilder2;
6627 setUpdateBuilder2.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(OutputBallots),
6628 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptors[OutputBallots]);
6629 setUpdateBuilder2.update(vk, device);
6630 }
6631
6632 // Clear any writes to ballots/stores OutputB::b[] aka buffer[1] during the counting pass
6633 // Note that its size may have changed since the first memory allocation
6634 deMemset(ptrs[OutputBallots], 0, (size_t)sizes[OutputBallots]);
6635 deMemset(ptrs[OutputCounts], 0, (size_t)sizes[OutputCounts]);
6636 deMemset(ptrs[OutputPrimitives], 0, (size_t)sizes[OutputPrimitives]);
6637
6638 // flush them all to the GPU
6639 flushAlloc(vk, device, buffers[OutputBallots]->getAllocation());
6640 flushAlloc(vk, device, buffers[OutputCounts]->getAllocation());
6641 flushAlloc(vk, device, buffers[OutputPrimitives]->getAllocation());
6642
6643 // run the actual shader with updated PushConstant
6644 pc.enableInvocationIndex = VK_TRUE;
6645 callRecordDrawingAndSubmit();
6646
6647 invalidateAlloc(vk, device, buffers[OutputCounts]->getAllocation());
6648 const uint32_t finalShaderMaxLoc = (*max_element(rangeLoc.first, rangeLoc.second));
6649 log << tcu::TestLog::Message << "Final maxLoc from shader: " << finalShaderMaxLoc << tcu::TestLog::EndMessage;
6650 if (finalShaderMaxLoc != shaderMaxLoc)
6651 {
6652 return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
6653 "maxLoc differs across shader invocations, expected: " + de::toString(shaderMaxLoc) +
6654 " got: " + de::toString(finalShaderMaxLoc));
6655 }
6656
6657 invalidateAlloc(vk, device, buffers[OutputBallots]->getAllocation());
6658 const tcu::UVec4 *ballots = static_cast<tcu::UVec4 *>(ptrs[OutputBallots]);
6659
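// Overwrite the simulated outputP with the values the GPU actually wrote, so the final CPU replay uses
// the same per-invocation subgroup assignment as the hardware.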
6660 invalidateAlloc(vk, device, buffers[OutputPrimitives]->getAllocation());
6661 auto outputPrange = makeStdBeginEnd<uint32_t>(ptrs[OutputPrimitives], counts[OutputPrimitives]);
6662 std::copy(outputPrange.first, outputPrange.second, outputP.begin());
6663
6664 try
6665 {
6666 ref.resize(counts[OutputBallots], tcu::UVec4(0u, 0u, 0u, 0u));
6667 }
6668 catch (const std::bad_alloc &)
6669 {
6670 // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
6671 return tcu::TestStatus(QP_TEST_RESULT_NOT_SUPPORTED,
6672 "Failed system memory allocation " + de::toString(sizes[OutputBallots]) + " bytes");
6673 }
6674
6675 // Simulate execution on the CPU, and compare against the GPU result
6676 const uint32_t finalHostMaxLoc = program->execute(m_context.getTestContext().getWatchDog(), false, m_subgroupSize,
6677 fragmentStride, invocationStride, ref, log, outputP, ballots);
6678
6679 const qpTestResult res = calculateAndLogResultEx(log, ballots, ref, finalHostMaxLoc, PrintMode::None);
6680
6681 return tcu::TestStatus(res, qpGetTestResultName(res));
6682 }
6683
6684 qpTestResult_e ReconvergenceTestVertexInstance::calculateAndLogResultEx(add_ref<tcu::TestLog> log,
6685 const tcu::UVec4 *result,
6686 const std::vector<tcu::UVec4> &ref,
6687 const uint32_t maxLoc,
6688 const PrintMode printMode)
6689 {
6690 DE_UNREF(maxLoc);
6691 DE_UNREF(printMode);
6692
6693 qpTestResult res = QP_TEST_RESULT_PASS;
6694 uint32_t mismatchCount = 0u;
6695 const uint32_t printMismatchCount = 5u;
6696
6697 // With maximal reconvergence, we should expect the output to exactly match the reference.
6698 const uint32_t ballotStoreCount = static_cast<uint32_t>(ref.size());
6699 for (uint32_t i = 0; i < ballotStoreCount; ++i)
6700 {
6701 const Ballot resultVal(result[i], m_subgroupSize);
6702 const Ballot refVal(ref.at(i), m_subgroupSize);
6703 if (resultVal != refVal)
6704 {
6705 if (mismatchCount++ < printMismatchCount)
6706 {
6707 res = QP_TEST_RESULT_FAIL;
6708 log << tcu::TestLog::Message << "Mismatch at " << i << "\nexpected: " << refVal
6709 << "\n got: " << resultVal << tcu::TestLog::EndMessage;
6710 if (printMode == PrintMode::Console)
6711 {
6712 std::cout << "Mismatch at " << i << "\nexpected: " << resultVal << "\n got: " << refVal
6713 << std::endl;
6714 }
6715 }
6716 }
6717 }
6718
6719 log << tcu::TestLog::Message << "Mismatch count: " << mismatchCount << " from " << ballotStoreCount
6720 << tcu::TestLog::EndMessage;
6721 if (printMode == PrintMode::Console)
6722 {
6723 std::cout << "Mismatch count: " << mismatchCount << " from " << ballotStoreCount << std::endl;
6724 }
6725
6726 return res;
6727 }
6728
6729 std::vector<Move<VkShaderModule>> ReconvergenceTestTessCtrlInstance::createShaders(void)
6730 {
6731 const DeviceInterface &vk = m_context.getDeviceInterface();
6732 const VkDevice device = m_context.getDevice();
6733
6734 Move<VkShaderModule> vertex = createShaderModule(vk, device, m_context.getBinaryCollection().get("vert"));
6735 Move<VkShaderModule> fragment = createShaderModule(vk, device, m_context.getBinaryCollection().get("frag"));
6736 Move<VkShaderModule> control = createShaderModule(vk, device, m_context.getBinaryCollection().get("test"));
6737 Move<VkShaderModule> evaluation = createShaderModule(vk, device, m_context.getBinaryCollection().get("tese"));
6738
6739 // { #vert, #frag, #tesc, #tese, geom }; if any
6740 std::vector<Move<VkShaderModule>> shaders;
6741 shaders.emplace_back(vertex);
6742 shaders.emplace_back(fragment);
6743 shaders.emplace_back(control);
6744 shaders.emplace_back(evaluation);
6745
6746 return shaders;
6747 }
6748
6749 tcu::TestStatus ReconvergenceTestTessCtrlInstance::iterate(void)
6750 {
6751 const DeviceInterface &vk = m_context.getDeviceInterface();
6752 const VkDevice device = m_context.getDevice();
6753 Allocator &allocator = m_context.getDefaultAllocator();
6754 const uint32_t queueIndex = m_context.getUniversalQueueFamilyIndex();
6755 add_ref<tcu::TestLog> log = m_context.getTestContext().getLog();
6756
6757 if (m_subgroupSize < TessCtrlRandomProgram::minSubgroupSize || m_subgroupSize > 64)
6758 {
6759 std::stringstream str;
6760 str << "Subgroup size less than " << TessCtrlRandomProgram::minSubgroupSize
6761 << " or greater than 64 not handled.";
6762 str.flush();
6763 TCU_THROW(TestError, str.str());
6764 }
6765
6766 deRandom rnd;
6767 deRandom_init(&rnd, m_data.seed);
6768
6769 vk::VkPhysicalDeviceProperties2 properties2;
6770 deMemset(&properties2, 0, sizeof(properties2));
6771 properties2.sType = vk::VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2;
6772 m_context.getInstanceInterface().getPhysicalDeviceProperties2(m_context.getPhysicalDevice(), &properties2);
6773 const VkPhysicalDeviceLimits &limits = properties2.properties.limits;
6774
6775 const uint32_t patchControlPoints = 1;
6776 const uint32_t vertexCount =
6777 (m_subgroupSize / TessCtrlRandomProgram::minSubgroupSize) * patchControlPoints * m_data.sizeX;
6778 const uint32_t primitiveStride = vertexCount / patchControlPoints;
6779 de::MovePtr<BufferWithMemory> vertexBuffer =
6780 createVertexBufferAndFlush(vertexCount, 1u, VK_PRIMITIVE_TOPOLOGY_PATCH_LIST);
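// Each patch spawns TessCtrlRandomProgram::minSubgroupSize control-shader invocations (logged below
// as "LayoutVertexOut"), so the total invocation count is vertexCount * minSubgroupSize.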
6781 const uint32_t invocationStride = vertexCount * TessCtrlRandomProgram::minSubgroupSize;
6782 DE_ASSERT(invocationStride < MAX_INVOCATIONS_ALL_TESTS);
6783
6784 log << tcu::TestLog::Message << "LayoutVertexOut: " << (uint32_t)TessCtrlRandomProgram::minSubgroupSize
6785 << tcu::TestLog::EndMessage;
6786 log << tcu::TestLog::Message << "patchControlPoints: " << patchControlPoints << tcu::TestLog::EndMessage;
6787 log << tcu::TestLog::Message << "primitiveStride: " << primitiveStride << tcu::TestLog::EndMessage;
6788 log << tcu::TestLog::Message << "invocationStride: " << invocationStride << tcu::TestLog::EndMessage;
6789 log << tcu::TestLog::Message << "usedSubgroupCount: " << m_data.sizeX << tcu::TestLog::EndMessage;
6790
6791 de::MovePtr<TessCtrlRandomProgram> program(new TessCtrlRandomProgram(m_data, invocationStride));
6792 program->generateRandomProgram(m_context.getTestContext().getWatchDog(), log);
6793
6794 std::vector<uint64_t> ref;
6795 const uint32_t simulationMaxLoc = program->simulate(true, m_subgroupSize, ref);
6796 log << tcu::TestLog::Message << "simulated maxLoc: " << simulationMaxLoc << tcu::TestLog::EndMessage;
6797 // maxLoc is per-invocation. Add one (to make sure no additional writes are done)
6798 uint32_t maxLoc = simulationMaxLoc;
6799 maxLoc += 1;
6800 maxLoc *= invocationStride;
6801
6802 constexpr uint32_t bufferCount = 3;
6803 enum Bindings
6804 {
6805 InputA,
6806 OutputBallots,
6807 OutputCounts,
6808 };
6809
6810 de::MovePtr<BufferWithMemory> buffers[bufferCount];
6811 vk::VkDescriptorBufferInfo bufferDescriptors[bufferCount];
6812
6813 VkDeviceSize sizes[bufferCount]{
6814 // InputA { uint a[]; } inputA; filled with a[i] == i
6815 invocationStride * sizeof(uint32_t),
6816 // OutputB { uvec2 b[]; } outputB;
6817 maxLoc * sizeof(uint64_t),
6818 // OutputC { uint loc[]; } outputC;
6819 invocationStride * sizeof(uint32_t),
6820 };
6821
6822 VkBufferUsageFlags usages[bufferCount]{
6823 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
6824 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
6825 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
6826 };
6827
6828 // allocate buffers
6829 for (uint32_t i = 0; i < bufferCount; ++i)
6830 {
6831 if (sizes[i] > limits.maxStorageBufferRange)
6832 TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
6833
6834 try
6835 {
6836 buffers[i] = de::MovePtr<BufferWithMemory>(
6837 new BufferWithMemory(vk, device, allocator,
6838 makeBufferCreateInfo(sizes[i], usages[i] | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
6839 VK_BUFFER_USAGE_TRANSFER_SRC_BIT),
6840 MemoryRequirement::HostVisible | MemoryRequirement::Cached));
6841 }
6842 catch (tcu::ResourceError &)
6843 {
6844 // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
6845 return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
6846 "Failed device memory allocation " + de::toString(sizes[i]) + " bytes");
6847 }
6848 bufferDescriptors[i] = makeDescriptorBufferInfo(**buffers[i], 0, sizes[i]);
6849 }
6850
6851 // get raw pointers to previously allocated buffers
6852 void *ptrs[bufferCount];
6853 for (uint32_t i = 0; i < bufferCount; ++i)
6854 {
6855 ptrs[i] = (uint32_t *)buffers[i]->getAllocation().getHostPtr();
6856 }
6857
6858 // populate buffers with their initial content
6859 {
6860 auto rangeBufferA = makeStdBeginEnd<uint32_t>(ptrs[InputA], invocationStride);
6861 std::iota(rangeBufferA.first, rangeBufferA.second, 0u);
6862 }
6863 deMemset(ptrs[OutputBallots], 0, (size_t)sizes[OutputBallots]);
6864 deMemset(ptrs[OutputCounts], 0, (size_t)sizes[OutputCounts]);
6865
6866 // (...) and flush them to the GPU
6867 for (uint32_t i = 0; i < bufferCount; ++i)
6868 {
6869 flushAlloc(vk, device, buffers[i]->getAllocation());
6870 }
6871
6872 VkDescriptorType descTypes[bufferCount]{
6873 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
6874 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
6875 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
6876 };
6877
6878 vk::DescriptorSetLayoutBuilder layoutBuilder;
6879 for (uint32_t i = 0; i < bufferCount; ++i)
6880 {
6881 layoutBuilder.addSingleBinding(descTypes[i], m_data.shaderStage);
6882 }
6883 vk::Unique<vk::VkDescriptorSetLayout> descriptorSetLayout(layoutBuilder.build(vk, device));
6884
6885 vk::DescriptorPoolBuilder poolBuilder;
6886 for (uint32_t i = 0; i < bufferCount; ++i)
6887 {
6888 poolBuilder.addType(descTypes[i], 1);
6889 }
6890 vk::Unique<vk::VkDescriptorPool> descriptorPool(
6891 poolBuilder.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
6892 vk::Unique<vk::VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
6893
6894 vk::DescriptorSetUpdateBuilder setUpdateBuilder;
6895 for (uint32_t i = 0; i < bufferCount; ++i)
6896 {
6897 setUpdateBuilder.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(i), descTypes[i],
6898 &bufferDescriptors[i]);
6899 }
6900 setUpdateBuilder.update(vk, device);
6901
6902 const VkPushConstantRange pushConstantRange{
6903 (VkShaderStageFlags)m_data.shaderStage, // VkShaderStageFlags stageFlags;
6904 0u, // uint32_t offset;
6905 sizeof(PushConstant) // uint32_t size;
6906 };
6907
6908 // TODO: verify that sizeof(PushConstant) does not exceed the device's maxPushConstantsSize limit
6909
6910 const VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo{
6911 VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, // sType
6912 nullptr, // pNext
6913 (VkPipelineLayoutCreateFlags)0, // flags
6914 1u, // setLayoutCount
6915 &descriptorSetLayout.get(), // pSetLayouts
6916 1u, // pushConstantRangeCount
6917 &pushConstantRange, // pPushConstantRanges
6918 };
6919
6920 const uint32_t imageWidth = 256;
6921 const uint32_t imageHeight = 256;
6922 const VkFormat format = VK_FORMAT_R8G8B8A8_UNORM;
6923 const VkImageCreateInfo imageCreateInfo{
6924 VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, // VkStructureType sType;
6925 nullptr, // const void* pNext;
6926 VkImageCreateFlags(0), // VkImageCreateFlags flags;
6927 VK_IMAGE_TYPE_2D, // VkImageType imageType;
6928 format, // VkFormat format;
6929 {imageWidth, imageHeight, 1u}, // VkExtent3D extent;
6930 1u, // uint32_t mipLevels;
6931 1u, // uint32_t arrayLayers;
6932 VK_SAMPLE_COUNT_1_BIT, // VkSampleCountFlagBits samples;
6933 VK_IMAGE_TILING_OPTIMAL, // VkImageTiling tiling;
6934 VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT, // VkImageUsageFlags usage;
6935 VK_SHARING_MODE_EXCLUSIVE, // VkSharingMode sharingMode;
6936 0u, // uint32_t queueFamilyIndexCount;
6937 0u, // const uint32_t* pQueueFamilyIndices;
6938 VK_IMAGE_LAYOUT_UNDEFINED // VkImageLayout initialLayout;
6939 };
6940 const VkImageSubresourceRange rscRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
6941 de::MovePtr<ImageWithMemory> image(
6942 new ImageWithMemory(vk, device, allocator, imageCreateInfo, vk::MemoryRequirement::Any));
6943 Move<VkImageView> view = makeImageView(vk, device, **image, VK_IMAGE_VIEW_TYPE_2D, format, rscRange);
6944 Move<VkRenderPass> renderPass = makeRenderPass(vk, device, format);
6945 Move<VkFramebuffer> framebuffer =
6946 makeFramebuffer(vk, device, *renderPass, *view, m_data.sizeX, m_data.sizeY, rscRange.layerCount);
6947 const VkRenderPassBeginInfo renderBeginInfo = makeRenderPassBeginInfo(*renderPass, *framebuffer);
6948 const Shaders shaders = createShaders();
6949 Move<VkPipelineLayout> pipelineLayout = createPipelineLayout(vk, device, &pipelineLayoutCreateInfo, NULL);
6950 Move<VkPipeline> pipeline = createGraphicsPipeline(*pipelineLayout, *renderPass, imageWidth, imageHeight, shaders,
6951 VK_PRIMITIVE_TOPOLOGY_PATCH_LIST, patchControlPoints);
6952 Move<VkCommandPool> cmdPool =
6953 createCommandPool(vk, device, vk::VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, queueIndex);
6954 Move<VkCommandBuffer> cmdBuffer = allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY);
6955
6956 PushConstant pc{};
6957 pc.invocationStride = 0u;
6958 pc.width = TessCtrlRandomProgram::minSubgroupSize;
6959 pc.height = patchControlPoints;
6960 pc.primitiveStride = primitiveStride;
6961
6962 auto callRecordDrawingAndSubmit = std::bind(&ReconvergenceTestGraphicsInstance::recordDrawingAndSubmit, this,
6963 *cmdBuffer, *pipelineLayout, *pipeline, *descriptorSet, std::cref(pc),
6964 std::cref(renderBeginInfo), **vertexBuffer, vertexCount, **image);
6965
6966 // compute "maxLoc", which is a potential maximum number of locations written
6967 callRecordDrawingAndSubmit();
6968
6969 // Take the maximum of "maxLoc" over all invocations.
6970 invalidateAlloc(vk, device, buffers[OutputCounts]->getAllocation());
6971 auto rangeLoc = makeStdBeginEnd<const uint32_t>(ptrs[OutputCounts], invocationStride);
6972 const uint32_t computedShaderMaxLoc = (*max_element(rangeLoc.first, rangeLoc.second));
6973 log << tcu::TestLog::Message << "computed shaderMaxLoc: " << computedShaderMaxLoc << tcu::TestLog::EndMessage;
6974
6975 // If we need more space, reallocate OutputB::b[] aka buffers[1]
6976 if (computedShaderMaxLoc > simulationMaxLoc)
6977 {
6978 // Add one (to make sure no additional writes are done) and multiply by
6979 // the number of invocations and current primitive count
6980 maxLoc = (computedShaderMaxLoc + 1) * invocationStride;
6981 sizes[OutputBallots] = maxLoc * sizeof(uint64_t);
6982
6983 if (sizes[OutputBallots] > limits.maxStorageBufferRange)
6984 TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
6985
6986 try
6987 {
6988 buffers[OutputBallots] = de::MovePtr<BufferWithMemory>(new BufferWithMemory(
6989 vk, device, allocator,
6990 makeBufferCreateInfo(sizes[OutputBallots], VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
6991 VK_BUFFER_USAGE_TRANSFER_SRC_BIT),
6992 MemoryRequirement::HostVisible | MemoryRequirement::Cached));
6993 }
6994 catch (tcu::ResourceError &)
6995 {
6996 // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
6997 return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
6998 "Failed device memory allocation " + de::toString(sizes[OutputBallots]) + " bytes");
6999 }
7000 bufferDescriptors[OutputBallots] = makeDescriptorBufferInfo(**buffers[OutputBallots], 0, sizes[OutputBallots]);
7001 ptrs[OutputBallots] = buffers[OutputBallots]->getAllocation().getHostPtr();
7002
7003 vk::DescriptorSetUpdateBuilder setUpdateBuilder2;
7004 setUpdateBuilder2.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(OutputBallots),
7005 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptors[OutputBallots]);
7006 setUpdateBuilder2.update(vk, device);
7007 }
7008
7009 // Clear any writes to ballots/stores OutputB::b[] aka buffer[1] during the counting pass
7010 // Note that its size may have changed since the first memory allocation
7011 deMemset(ptrs[OutputBallots], 0, (size_t)sizes[OutputBallots]);
7012 // Clear any writes to counting OutputC::loc[] aka buffer[2] during the counting pass
7013 deMemset(ptrs[OutputCounts], 0, (size_t)sizes[OutputCounts]);
7014
7015 // flush them all to the GPU
7016 flushAlloc(vk, device, buffers[OutputBallots]->getAllocation());
7017 flushAlloc(vk, device, buffers[OutputCounts]->getAllocation());
7018
7019 // run the actual shader with updated PushConstant
7020 pc.invocationStride = invocationStride;
7021 pc.width = TessCtrlRandomProgram::minSubgroupSize;
7022 pc.height = patchControlPoints;
7023 pc.primitiveStride = primitiveStride;
7024 callRecordDrawingAndSubmit();
7025
7026 invalidateAlloc(vk, device, buffers[OutputCounts]->getAllocation());
7027 const uint32_t finalShaderMaxLoc = (*max_element(rangeLoc.first, rangeLoc.second));
7028 log << tcu::TestLog::Message << "final shaderMaxLoc: " << finalShaderMaxLoc << tcu::TestLog::EndMessage;
7029 if (finalShaderMaxLoc > computedShaderMaxLoc)
7030 {
7031 return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING, "maxLoc differs across shader invocations");
7032 }
7033
7034 invalidateAlloc(vk, device, buffers[OutputCounts]->getAllocation());
7035 invalidateAlloc(vk, device, buffers[OutputBallots]->getAllocation());
7036
7037 // Simulate execution on the CPU, and compare against the GPU result
7038 try
7039 {
7040 ref.resize(maxLoc, 0ull);
7041 }
7042 catch (const std::bad_alloc &)
7043 {
7044 // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
7045 return tcu::TestStatus(QP_TEST_RESULT_NOT_SUPPORTED,
7046 "Failed system memory allocation " + de::toString(maxLoc * sizeof(uint64_t)) + " bytes");
7047 }
7048
7049 program->simulate(false, m_subgroupSize, ref);
7050
7051 const uint64_t *ballots = static_cast<uint64_t *>(ptrs[OutputBallots]);
7052 qpTestResult res = calculateAndLogResult(ballots, ref, invocationStride, m_subgroupSize, finalShaderMaxLoc,
7053 (invocationStride / 3), PrintMode::None);
7054
7055 return tcu::TestStatus(res, qpGetTestResultName(res));
7056 }
7057
7058 std::vector<Move<VkShaderModule>> ReconvergenceTestTessEvalInstance::createShaders(void)
7059 {
7060 const DeviceInterface &vk = m_context.getDeviceInterface();
7061 const VkDevice device = m_context.getDevice();
7062
7063 Move<VkShaderModule> vertex = createShaderModule(vk, device, m_context.getBinaryCollection().get("vert"));
7064 Move<VkShaderModule> fragment = createShaderModule(vk, device, m_context.getBinaryCollection().get("frag"));
7065 Move<VkShaderModule> control = createShaderModule(vk, device, m_context.getBinaryCollection().get("tesc"));
7066 Move<VkShaderModule> evaluation = createShaderModule(vk, device, m_context.getBinaryCollection().get("test"));
7067
7068 // { #vert, #frag, #tesc, #tese, geom }; if any
7069 std::vector<Move<VkShaderModule>> shaders;
7070 shaders.emplace_back(vertex);
7071 shaders.emplace_back(fragment);
7072 shaders.emplace_back(control);
7073 shaders.emplace_back(evaluation);
7074
7075 return shaders;
7076 }
7077
7078 tcu::TestStatus ReconvergenceTestTessEvalInstance::iterate(void)
7079 {
7080 const DeviceInterface &vk = m_context.getDeviceInterface();
7081 const VkDevice device = m_context.getDevice();
7082 Allocator &allocator = m_context.getDefaultAllocator();
7083 const uint32_t queueIndex = m_context.getUniversalQueueFamilyIndex();
7084 add_ref<tcu::TestLog> log = m_context.getTestContext().getLog();
7085
7086 if (m_subgroupSize < TessEvalRandomProgram::quadInvocationCount || m_subgroupSize > 64)
7087 {
7088 std::stringstream str;
7089 str << "Subgroup size less than " << TessEvalRandomProgram::quadInvocationCount
7090 << " or greater than 64 not handled.";
7091 str.flush();
7092 TCU_THROW(TestError, str.str());
7093 }
7094
7095 deRandom rnd;
7096 deRandom_init(&rnd, m_data.seed);
7097
7098 vk::VkPhysicalDeviceProperties2 properties2;
7099 deMemset(&properties2, 0, sizeof(properties2));
7100 properties2.sType = vk::VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2;
7101 m_context.getInstanceInterface().getPhysicalDeviceProperties2(m_context.getPhysicalDevice(), &properties2);
7102 const VkPhysicalDeviceLimits &limits = properties2.properties.limits;
7103
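// By construction each patch yields TessEvalRandomProgram::quadInvocationCount evaluation-shader
// invocations, so one subgroup covers patchesPerGroup patches and the total invocation count equals
// m_data.sizeX * m_subgroupSize.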
7104 const uint32_t patchesPerGroup = m_subgroupSize / TessEvalRandomProgram::quadInvocationCount;
7105 const uint32_t primitiveStride = patchesPerGroup * m_data.sizeX;
7106 const uint32_t invocationStride = primitiveStride * TessEvalRandomProgram::quadInvocationCount;
7107 const std::vector<tcu::Vec4> vertices = generateVertices(invocationStride, VK_PRIMITIVE_TOPOLOGY_POINT_LIST);
7108 const uint32_t vertexCount = uint32_t(vertices.size());
7109 de::MovePtr<BufferWithMemory> vertexBuffer = createVertexBufferAndFlush(vertices);
7110
7111 DE_ASSERT(invocationStride <= MAX_INVOCATIONS_ALL_TESTS);
7112
7113 de::MovePtr<TessEvalRandomProgram> program(new TessEvalRandomProgram(m_data, invocationStride));
7114 program->generateRandomProgram(m_context.getTestContext().getWatchDog(), log);
7115
7116 std::vector<uint64_t> ref;
7117 const uint32_t simulationMaxLoc = program->simulate(true, m_subgroupSize, ref);
7118 log << tcu::TestLog::Message << "simulated maxLoc: " << simulationMaxLoc << tcu::TestLog::EndMessage;
7119 log << tcu::TestLog::Message << "effective patch size: " << m_data.sizeY << tcu::TestLog::EndMessage;
7120 log << tcu::TestLog::Message << "effective patch count: " << primitiveStride << tcu::TestLog::EndMessage;
7121 log << tcu::TestLog::Message << "total invocation count: " << invocationStride << tcu::TestLog::EndMessage;
7122
7123 // maxLoc is per-invocation. Add one (to make sure no additional writes are done).
7124 uint32_t maxLoc = simulationMaxLoc;
7125 maxLoc += 1;
7126 maxLoc *= invocationStride;
7127
7128 constexpr uint32_t bufferCount = 3;
7129 enum Bindings
7130 {
7131 InputA,
7132 OutputBallots,
7133 OutputCounts,
7134 };
7135
7136 de::MovePtr<BufferWithMemory> buffers[bufferCount];
7137 vk::VkDescriptorBufferInfo bufferDescriptors[bufferCount];
7138
7139 VkDeviceSize sizes[bufferCount]{
7140 // InputA { uint a[]; } inputA; filled with a[i] == i
7141 invocationStride * sizeof(uint32_t),
7142 // OutputB { uvec2 b[]; } outputB;
7143 maxLoc * sizeof(uint64_t),
7144 // OutputC { uint loc[]; } outputC;
7145 invocationStride * sizeof(uint32_t),
7146 };
7147
7148 VkBufferUsageFlags usages[bufferCount]{
7149 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
7150 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
7151 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
7152 };
7153
7154 // allocate buffers
7155 for (uint32_t i = 0; i < bufferCount; ++i)
7156 {
7157 if (sizes[i] > limits.maxStorageBufferRange)
7158 TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
7159
7160 try
7161 {
7162 buffers[i] = de::MovePtr<BufferWithMemory>(
7163 new BufferWithMemory(vk, device, allocator,
7164 makeBufferCreateInfo(sizes[i], usages[i] | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
7165 VK_BUFFER_USAGE_TRANSFER_SRC_BIT),
7166 MemoryRequirement::HostVisible | MemoryRequirement::Cached));
7167 }
7168 catch (tcu::ResourceError &)
7169 {
7170 // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
7171 return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
7172 "Failed device memory allocation " + de::toString(sizes[i]) + " bytes");
7173 }
7174 bufferDescriptors[i] = makeDescriptorBufferInfo(**buffers[i], 0, sizes[i]);
7175 }
7176
7177 // get raw pointers to previously allocated buffers
7178 void *ptrs[bufferCount];
7179 for (uint32_t i = 0; i < bufferCount; ++i)
7180 {
7181 ptrs[i] = (uint32_t *)buffers[i]->getAllocation().getHostPtr();
7182 }
7183
7184 // populate buffers with their initial content
7185 {
7186 auto rangeBufferA = makeStdBeginEnd<uint32_t>(ptrs[InputA], invocationStride);
7187 std::iota(rangeBufferA.first, rangeBufferA.second, 0u);
7188 }
7189 deMemset(ptrs[OutputBallots], 0, (size_t)sizes[OutputBallots]);
7190 deMemset(ptrs[OutputCounts], 0, (size_t)sizes[OutputCounts]);
7191
7192 // (...) and flush them to the GPU
7193 for (uint32_t i = 0; i < bufferCount; ++i)
7194 {
7195 flushAlloc(vk, device, buffers[i]->getAllocation());
7196 }
7197
7198 VkDescriptorType descTypes[bufferCount]{
7199 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
7200 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
7201 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
7202 };
7203
7204 vk::DescriptorSetLayoutBuilder layoutBuilder;
7205 for (uint32_t i = 0; i < bufferCount; ++i)
7206 {
7207 layoutBuilder.addSingleBinding(descTypes[i], m_data.shaderStage);
7208 }
7209 vk::Unique<vk::VkDescriptorSetLayout> descriptorSetLayout(layoutBuilder.build(vk, device));
7210
7211 vk::DescriptorPoolBuilder poolBuilder;
7212 for (uint32_t i = 0; i < bufferCount; ++i)
7213 {
7214 poolBuilder.addType(descTypes[i], 1);
7215 }
7216 vk::Unique<vk::VkDescriptorPool> descriptorPool(
7217 poolBuilder.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
7218 vk::Unique<vk::VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
7219
7220 vk::DescriptorSetUpdateBuilder setUpdateBuilder;
7221 for (uint32_t i = 0; i < bufferCount; ++i)
7222 {
7223 setUpdateBuilder.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(i), descTypes[i],
7224 &bufferDescriptors[i]);
7225 }
7226 setUpdateBuilder.update(vk, device);
7227
7228 const VkPushConstantRange pushConstantRange{
7229 (VkShaderStageFlags)m_data.shaderStage, // VkShaderStageFlags stageFlags;
7230 0u, // uint32_t offset;
7231 sizeof(PushConstant) // uint32_t size;
7232 };
7233
7234 // TODO: verify that sizeof(PushConstant) does not exceed the device's maxPushConstantsSize limit
7235
7236 const VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo{
7237 VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, // sType
7238 nullptr, // pNext
7239 (VkPipelineLayoutCreateFlags)0, // flags
7240 1u, // setLayoutCount
7241 &descriptorSetLayout.get(), // pSetLayouts
7242 1u, // pushConstantRangeCount
7243 &pushConstantRange, // pPushConstantRanges
7244 };
7245
7246 const uint32_t imageWidth = 256;
7247 const uint32_t imageHeight = 256;
7248 const VkFormat format = VK_FORMAT_R8G8B8A8_UNORM;
7249 const VkImageCreateInfo imageCreateInfo{
7250 VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, // VkStructureType sType;
7251 nullptr, // const void* pNext;
7252 VkImageCreateFlags(0), // VkImageCreateFlags flags;
7253 VK_IMAGE_TYPE_2D, // VkImageType imageType;
7254 format, // VkFormat format;
7255 {imageWidth, imageHeight, 1u}, // VkExtent3D extent;
7256 1u, // uint32_t mipLevels;
7257 1u, // uint32_t arrayLayers;
7258 VK_SAMPLE_COUNT_1_BIT, // VkSampleCountFlagBits samples;
7259 VK_IMAGE_TILING_OPTIMAL, // VkImageTiling tiling;
7260 VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT, // VkImageUsageFlags usage;
7261 VK_SHARING_MODE_EXCLUSIVE, // VkSharingMode sharingMode;
7262 0u, // uint32_t queueFamilyIndexCount;
7263 0u, // const uint32_t* pQueueFamilyIndices;
7264 VK_IMAGE_LAYOUT_UNDEFINED // VkImageLayout initialLayout;
7265 };
7266 const VkImageSubresourceRange rscRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
7267 de::MovePtr<ImageWithMemory> image(
7268 new ImageWithMemory(vk, device, allocator, imageCreateInfo, vk::MemoryRequirement::Any));
7269 Move<VkImageView> view = makeImageView(vk, device, **image, VK_IMAGE_VIEW_TYPE_2D, format, rscRange);
7270 Move<VkRenderPass> renderPass = makeRenderPass(vk, device, format);
7271 Move<VkFramebuffer> framebuffer =
7272 makeFramebuffer(vk, device, *renderPass, *view, m_data.sizeX, m_data.sizeY, rscRange.layerCount);
7273 const VkRenderPassBeginInfo renderBeginInfo = makeRenderPassBeginInfo(*renderPass, *framebuffer);
7274 const Shaders shaders = createShaders();
7275 Move<VkPipelineLayout> pipelineLayout = createPipelineLayout(vk, device, &pipelineLayoutCreateInfo, NULL);
7276 Move<VkPipeline> pipeline =
7277 createGraphicsPipeline(*pipelineLayout, *renderPass, imageWidth, imageHeight, shaders,
7278 VK_PRIMITIVE_TOPOLOGY_PATCH_LIST, TessEvalRandomProgram::quadInvocationCount);
7279 Move<VkCommandPool> cmdPool =
7280 createCommandPool(vk, device, vk::VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, queueIndex);
7281 Move<VkCommandBuffer> cmdBuffer = allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY);
7282
7283 PushConstant pc{};
7284 pc.invocationStride = 0u;
7285 pc.width = TessEvalRandomProgram::quadInvocationCount;
7286
7287 auto callRecordDrawingAndSubmit = std::bind(&ReconvergenceTestGraphicsInstance::recordDrawingAndSubmit, this,
7288 *cmdBuffer, *pipelineLayout, *pipeline, *descriptorSet, std::cref(pc),
7289 std::cref(renderBeginInfo), **vertexBuffer, vertexCount, **image);
7290
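 // Two-pass scheme: the first draw only counts how many ballot entries each invocation would
 // write (OutputCounts). The host then sizes the ballot buffer as (maxLoc + 1) * invocationStride
 // and replays the same draw so the shader can actually store its ballots.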
7291 // compute "maxLoc", which is a potential maximum number of locations written
7292 callRecordDrawingAndSubmit();
7293
7294 // Take the maximum of "maxLoc" over all invocations.
7295 invalidateAlloc(vk, device, buffers[OutputCounts]->getAllocation());
7296 auto rangeLoc = makeStdBeginEnd<const uint32_t>(ptrs[OutputCounts], invocationStride);
7297 const uint32_t computedShaderMaxLoc = (*max_element(rangeLoc.first, rangeLoc.second));
7298 log << tcu::TestLog::Message << "computed shaderMaxLoc: " << computedShaderMaxLoc << tcu::TestLog::EndMessage;
7299
7300 // If we need more space, reallocate OutputB::b[] aka buffers[1]
7301 if (computedShaderMaxLoc > simulationMaxLoc)
7302 {
7303 // Add one (to leave headroom so no out-of-bounds writes occur) and multiply by
7304 // the invocation stride, i.e. the current primitive count
7305 maxLoc = (computedShaderMaxLoc + 1) * invocationStride;
7306 sizes[OutputBallots] = maxLoc * sizeof(uint64_t);
7307
7308 if (sizes[OutputBallots] > limits.maxStorageBufferRange)
7309 TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
7310
7311 try
7312 {
7313 buffers[OutputBallots] = de::MovePtr<BufferWithMemory>(new BufferWithMemory(
7314 vk, device, allocator,
7315 makeBufferCreateInfo(sizes[OutputBallots], VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
7316 VK_BUFFER_USAGE_TRANSFER_SRC_BIT),
7317 MemoryRequirement::HostVisible | MemoryRequirement::Cached));
7318 }
7319 catch (tcu::ResourceError &)
7320 {
7321 // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
7322 return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
7323 "Failed device memory allocation " + de::toString(sizes[OutputBallots]) + " bytes");
7324 }
7325 bufferDescriptors[OutputBallots] = makeDescriptorBufferInfo(**buffers[OutputBallots], 0, sizes[OutputBallots]);
7326 ptrs[OutputBallots] = buffers[OutputBallots]->getAllocation().getHostPtr();
7327
7328 vk::DescriptorSetUpdateBuilder setUpdateBuilder2;
7329 setUpdateBuilder2.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(OutputBallots),
7330 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptors[OutputBallots]);
7331 setUpdateBuilder2.update(vk, device);
7332 }
7333
7334 // Clear any writes to ballots/stores OutputB::b[] aka buffer[1] made during the counting pass.
7335 // Note that its size may have changed since the first memory allocation.
7336 deMemset(ptrs[OutputBallots], 0, (size_t)sizes[OutputBallots]);
7337 // Clear any writes to the counters OutputC::loc[] aka buffer[2] made during the counting pass.
7338 deMemset(ptrs[OutputCounts], 0, (size_t)sizes[OutputCounts]);
7339
7340 // flush them all to the GPU
7341 flushAlloc(vk, device, buffers[OutputBallots]->getAllocation());
7342 flushAlloc(vk, device, buffers[OutputCounts]->getAllocation());
7343
7344 // run the actual shader with updated PushConstant
7345 pc.invocationStride = invocationStride;
7346 pc.width = TessEvalRandomProgram::quadInvocationCount;
7347 callRecordDrawingAndSubmit();
7348
7349 invalidateAlloc(vk, device, buffers[OutputCounts]->getAllocation());
7350 const uint32_t finalShaderMaxLoc = (*max_element(rangeLoc.first, rangeLoc.second));
7351 log << tcu::TestLog::Message << "final shaderMaxLoc: " << finalShaderMaxLoc << tcu::TestLog::EndMessage;
7352 if (finalShaderMaxLoc > computedShaderMaxLoc)
7353 {
7354 std::stringstream s;
7355 s << "maxLoc differs across shader invocations: " << finalShaderMaxLoc << " and " << computedShaderMaxLoc;
7356 return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING, s.str());
7357 }
7358
7359 invalidateAlloc(vk, device, buffers[OutputCounts]->getAllocation());
7360 invalidateAlloc(vk, device, buffers[OutputBallots]->getAllocation());
7361
7362 // Simulate execution on the CPU, and compare against the GPU result
7363 try
7364 {
7365 ref.resize(maxLoc, 0ull);
7366 }
7367 catch (const std::bad_alloc &)
7368 {
7369 // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
7370 return tcu::TestStatus(QP_TEST_RESULT_NOT_SUPPORTED,
7371 "Failed system memory allocation " + de::toString(maxLoc * sizeof(uint64_t)) + " bytes");
7372 }
7373
7374 program->simulate(false, m_subgroupSize, ref);
7375
7376 const uint64_t *ballots = static_cast<uint64_t *>(ptrs[OutputBallots]);
7377 qpTestResult res = calculateAndLogResult(ballots, ref, invocationStride, m_subgroupSize, finalShaderMaxLoc,
7378 (invocationStride / 3), PrintMode::None);
7379
7380 return tcu::TestStatus(res, qpGetTestResultName(res));
7381 }
7382
7383 de::MovePtr<BufferWithMemory> ReconvergenceTestGeometryInstance::createVertexBufferAndFlush(
7384 uint32_t cellsHorz, uint32_t cellsVert, VkPrimitiveTopology topology)
7385 {
7386 DE_UNREF(topology);
7387 DE_ASSERT(VK_PRIMITIVE_TOPOLOGY_POINT_LIST == topology);
7388 const std::vector<tcu::Vec4> vertices = GeometryRandomProgram::Arrangement::generatePrimitives(
7389 cellsHorz, cellsVert, GeometryRandomProgram::fillPercentage);
7390 return ReconvergenceTestGraphicsInstance::createVertexBufferAndFlush(vertices);
7391 }
7392
7393 std::vector<Move<VkShaderModule>> ReconvergenceTestGeometryInstance::createShaders(void)
7394 {
7395 const DeviceInterface &vk = m_context.getDeviceInterface();
7396 const VkDevice device = m_context.getDevice();
7397
7398 Move<VkShaderModule> vertex = createShaderModule(vk, device, m_context.getBinaryCollection().get("vert"));
7399 Move<VkShaderModule> fragment = createShaderModule(vk, device, m_context.getBinaryCollection().get("frag"));
7400 Move<VkShaderModule> geometry = createShaderModule(vk, device, m_context.getBinaryCollection().get("test"));
7401
7402 // Shader slot order: { #vert, #frag, tesc, tese, #geom }; '#' marks the stages populated below, unused slots stay empty
7403 std::vector<Move<VkShaderModule>> shaders;
7404 shaders.emplace_back(vertex);
7405 shaders.emplace_back(fragment);
7406 shaders.emplace_back();
7407 shaders.emplace_back();
7408 shaders.emplace_back(geometry);
7409
7410 return shaders;
7411 }
7412
7413 tcu::TestStatus ReconvergenceTestGeometryInstance::iterate(void)
7414 {
7415 const VkPhysicalDeviceLimits &limits = m_context.getDeviceProperties().limits;
7416 if (sizeof(PushConstant) > limits.maxPushConstantsSize)
7417 {
7418 return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
7419 "PushConstant size " + std::to_string(sizeof(PushConstant)) + " exceeds device limit " +
7420 std::to_string(limits.maxPushConstantsSize));
7421 }
7422
7423 const DeviceInterface &vk = m_context.getDeviceInterface();
7424 const VkDevice device = m_context.getDevice();
7425 Allocator &allocator = m_context.getDefaultAllocator();
7426 const uint32_t queueIndex = m_context.getUniversalQueueFamilyIndex();
7427 add_ref<tcu::TestLog> log = m_context.getTestContext().getLog();
7428 const VkPrimitiveTopology topology = VK_PRIMITIVE_TOPOLOGY_POINT_LIST;
7429 const uint32_t fragmentStride = uint32_t(m_data.sizeX * m_data.sizeY);
7430 const uint32_t invocationStride = GeometryRandomProgram::Arrangement::calculatePrimitiveCount(
7431 m_data.sizeX, m_data.sizeY, GeometryRandomProgram::fillPercentage);
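 // fragmentStride covers every framebuffer pixel, while invocationStride is the number of point
 // primitives generated for the grid (scaled by the fill percentage).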
7432
7433 de::MovePtr<GeometryRandomProgram> program(new GeometryRandomProgram(m_data));
7434 program->generateRandomProgram(m_context.getTestContext().getWatchDog(), log);
7435
7436 // simulate content of outputP buffer
7437 std::vector<uint32_t> outputP =
7438 GeometryRandomProgram::Arrangement::generateVectorOutputP(m_subgroupSize, invocationStride);
7439
7440 std::vector<tcu::UVec4> ref;
7441 const uint32_t hostMaxLoc = program->execute(m_context.getTestContext().getWatchDog(), true, m_subgroupSize,
7442 fragmentStride, invocationStride, ref, log, outputP, nullptr);
7443 log << tcu::TestLog::Message << "Rendering area : " << tcu::UVec2(m_data.sizeX, m_data.sizeY)
7444 << tcu::TestLog::EndMessage;
7445 log << tcu::TestLog::Message << "invocationStride: " << invocationStride << tcu::TestLog::EndMessage;
7446 log << tcu::TestLog::Message << "Simulated maxLoc: " << hostMaxLoc << tcu::TestLog::EndMessage;
7447 // maxLoc is per-invocation. Add one for headroom (so no out-of-bounds writes occur) and scale by the invocation count.
7448 uint32_t maxLoc = hostMaxLoc;
7449 maxLoc += 1;
7450 maxLoc *= invocationStride;
7451
7452 constexpr uint32_t bufferCount = 4u;
7453 enum Bindings
7454 {
7455 InputA,
7456 OutputBallots,
7457 OutputCounts,
7458 OutputPrimitives
7459 };
7460
7461 de::MovePtr<BufferWithMemory> buffers[bufferCount];
7462 vk::VkDescriptorBufferInfo bufferDescriptors[bufferCount];
7463
7464 uint32_t counts[bufferCount]{// InputA { uint a[]; } inputA;
7465 uint32_t(m_data.sizeX * m_data.sizeY),
7466 // OutputB { uvec2 b[]; } outputB;
7467 maxLoc,
7468 // OutputC { uint loc[]; } outputC;
7469 invocationStride,
7470 // OutputP { uint p[]; } outputP;
7471 uint32_t(outputP.size())};
7472
7473 VkDeviceSize sizes[bufferCount]{// InputA { uint a[]; } inputA;
7474 counts[InputA] * sizeof(uint32_t),
7475 // OutputB { uvec2 b[]; } outputB;
7476 counts[OutputBallots] * sizeof(tcu::UVec4),
7477 // OutputC { uint loc[]; } outputC;
7478 counts[OutputCounts] * sizeof(uint32_t),
7479 // OutputP { uint p[]; } outputP;
7480 counts[OutputPrimitives] * sizeof(uint32_t)};
7481
7482 const VkBufferUsageFlags cmnUsages = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
7483 VkBufferUsageFlags usages[bufferCount]{
7484 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
7485 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
7486 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
7487 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
7488 };
7489
7490 // allocate buffers
7491 for (uint32_t i = 0; i < bufferCount; ++i)
7492 {
7493 if (sizes[i] > limits.maxStorageBufferRange)
7494 TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
7495 try
7496 {
7497 buffers[i] = de::MovePtr<BufferWithMemory>(
7498 new BufferWithMemory(vk, device, allocator, makeBufferCreateInfo(sizes[i], usages[i] | cmnUsages),
7499 MemoryRequirement::HostVisible | MemoryRequirement::Cached));
7500 }
7501 catch (tcu::ResourceError &)
7502 {
7503 // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
7504 return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
7505 "Failed device memory allocation " + de::toString(sizes[i]) + " bytes");
7506 }
7507 bufferDescriptors[i] = makeDescriptorBufferInfo(**buffers[i], 0, sizes[i]);
7508 }
7509
7510 // get raw pointers to previously allocated buffers
7511 void *ptrs[bufferCount];
7512 for (uint32_t i = 0; i < bufferCount; ++i)
7513 {
7514 ptrs[i] = (uint32_t *)buffers[i]->getAllocation().getHostPtr();
7515 }
7516
7517 // populate buffers with their initial content
7518 {
7519 auto rangeBufferA = makeStdBeginEnd<uint32_t>(ptrs[InputA], counts[InputA]);
7520 std::iota(rangeBufferA.first, rangeBufferA.second, 0u);
7521 }
7522 deMemset(ptrs[OutputBallots], 0, (size_t)sizes[OutputBallots]);
7523 deMemset(ptrs[OutputCounts], 0, (size_t)sizes[OutputCounts]);
7524 deMemset(ptrs[OutputPrimitives], 0, (size_t)sizes[OutputPrimitives]);
7525
7526 // ... and flush them all to the GPU
7527 for (uint32_t i = 0; i < bufferCount; ++i)
7528 {
7529 flushAlloc(vk, device, buffers[i]->getAllocation());
7530 }
7531
7532 VkDescriptorType descTypes[bufferCount]{
7533 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
7534 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
7535 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
7536 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
7537 };
7538
7539 vk::DescriptorSetLayoutBuilder layoutBuilder;
7540 for (uint32_t i = 0; i < bufferCount; ++i)
7541 {
7542 layoutBuilder.addSingleBinding(descTypes[i], m_data.shaderStage);
7543 }
7544 vk::Unique<vk::VkDescriptorSetLayout> descriptorSetLayout(layoutBuilder.build(vk, device));
7545
7546 vk::DescriptorPoolBuilder poolBuilder;
7547 for (uint32_t i = 0; i < bufferCount; ++i)
7548 {
7549 poolBuilder.addType(descTypes[i], 1);
7550 }
7551 vk::Unique<vk::VkDescriptorPool> descriptorPool(
7552 poolBuilder.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
7553 vk::Unique<vk::VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
7554
7555 vk::DescriptorSetUpdateBuilder setUpdateBuilder;
7556 for (uint32_t i = 0; i < bufferCount; ++i)
7557 {
7558 setUpdateBuilder.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(i), descTypes[i],
7559 &bufferDescriptors[i]);
7560 }
7561 setUpdateBuilder.update(vk, device);
7562
7563 const VkPushConstantRange pushConstantRange{
7564 (VkShaderStageFlags)m_data.shaderStage, // VkShaderStageFlags stageFlags;
7565 0u, // uint32_t offset;
7566 sizeof(PushConstant) // uint32_t size;
7567 };
7568
7569 const VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo{
7570 VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, // sType
7571 nullptr, // pNext
7572 (VkPipelineLayoutCreateFlags)0, // flags
7573 1u, // setLayoutCount
7574 &descriptorSetLayout.get(), // pSetLayouts
7575 1u, // pushConstantRangeCount
7576 &pushConstantRange, // pPushConstantRanges
7577 };
7578
7579 const uint32_t imageWidth = m_data.sizeX;
7580 const uint32_t imageHeight = m_data.sizeY;
7581 const VkFormat format = VK_FORMAT_R8G8B8A8_UNORM;
7582 const VkImageCreateInfo imageCreateInfo{
7583 VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, // VkStructureType sType;
7584 nullptr, // const void* pNext;
7585 VkImageCreateFlags(0), // VkImageCreateFlags flags;
7586 VK_IMAGE_TYPE_2D, // VkImageType imageType;
7587 format, // VkFormat format;
7588 {imageWidth, imageHeight, 1u}, // VkExtent3D extent;
7589 1u, // uint32_t mipLevels;
7590 1u, // uint32_t arrayLayers;
7591 VK_SAMPLE_COUNT_1_BIT, // VkSampleCountFlagBits samples;
7592 VK_IMAGE_TILING_OPTIMAL, // VkImageTiling tiling;
7593 VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT, // VkImageUsageFlags usage;
7594 VK_SHARING_MODE_EXCLUSIVE, // VkSharingMode sharingMode;
7595 0u, // uint32_t queueFamilyIndexCount;
7596 0u, // const uint32_t* pQueueFamilyIndices;
7597 VK_IMAGE_LAYOUT_UNDEFINED // VkImageLayout initialLayout;
7598 };
7599 const VkImageSubresourceRange rscRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
7600 de::MovePtr<ImageWithMemory> image(
7601 new ImageWithMemory(vk, device, allocator, imageCreateInfo, vk::MemoryRequirement::Any));
7602 Move<VkImageView> view = makeImageView(vk, device, **image, VK_IMAGE_VIEW_TYPE_2D, format, rscRange);
7603 Move<VkRenderPass> renderPass = makeRenderPass(vk, device, format);
7604 Move<VkFramebuffer> framebuffer =
7605 makeFramebuffer(vk, device, *renderPass, *view, m_data.sizeX, m_data.sizeY, rscRange.layerCount);
7606 de::MovePtr<BufferWithMemory> vertexBuffer = createVertexBufferAndFlush(m_data.sizeX, m_data.sizeY, topology);
7607 const VkRenderPassBeginInfo renderBeginInfo = makeRenderPassBeginInfo(*renderPass, *framebuffer);
7608 const Shaders shaders = createShaders();
7609 Move<VkPipelineLayout> pipelineLayout = createPipelineLayout(vk, device, &pipelineLayoutCreateInfo, NULL);
7610 Move<VkPipeline> pipeline = createGraphicsPipeline(*pipelineLayout, *renderPass, imageWidth, imageHeight, shaders,
7611 VK_PRIMITIVE_TOPOLOGY_POINT_LIST);
7612 Move<VkCommandPool> cmdPool =
7613 createCommandPool(vk, device, vk::VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, queueIndex);
7614 Move<VkCommandBuffer> cmdBuffer = allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY);
7615
7616 PushConstant pc{};
7617 pc.invocationStride = invocationStride;
7618 pc.width = m_data.sizeX;
7619 pc.height = m_data.sizeY;
7620 pc.enableInvocationIndex = VK_FALSE;
7621
7622 auto callRecordDrawingAndSubmit = std::bind(&ReconvergenceTestGraphicsInstance::recordDrawingAndSubmit, this,
7623 *cmdBuffer, *pipelineLayout, *pipeline, *descriptorSet, std::cref(pc),
7624 std::cref(renderBeginInfo), **vertexBuffer, invocationStride, **image);
7625
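 // Same two-pass counting scheme as in the tessellation variant above: count first, resize the
 // ballot buffer if needed, then replay the draw to record the actual ballots.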
7626 // compute "maxLoc", which is a potential maximum number of locations written
7627 callRecordDrawingAndSubmit();
7628
7629 // Take the maximum of "maxLoc" over all invocations.
7630 invalidateAlloc(vk, device, buffers[OutputCounts]->getAllocation());
7631 auto rangeLoc = makeStdBeginEnd<const uint32_t>(ptrs[OutputCounts], invocationStride);
7632 const uint32_t shaderMaxLoc = (*max_element(rangeLoc.first, rangeLoc.second));
7633 log << tcu::TestLog::Message << "computed maxLoc in shader: " << shaderMaxLoc << tcu::TestLog::EndMessage;
7634
7635 // If we need more space, reallocate OutputB::b[] aka buffers[1]
7636 if (shaderMaxLoc > hostMaxLoc)
7637 {
7638 // Add one (to leave headroom so no out-of-bounds writes occur) and multiply by
7639 // the invocation stride, i.e. the current primitive count
7640 maxLoc = (std::max(shaderMaxLoc, hostMaxLoc) + 1u) * invocationStride;
7641 counts[OutputBallots] = maxLoc;
7642 sizes[OutputBallots] = counts[OutputBallots] * sizeof(tcu::UVec4);
7643
7644 if (sizes[OutputBallots] > limits.maxStorageBufferRange)
7645 TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
7646
7647 try
7648 {
7649 buffers[OutputBallots] = de::MovePtr<BufferWithMemory>(new BufferWithMemory(
7650 vk, device, allocator, makeBufferCreateInfo(sizes[OutputBallots], VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | cmnUsages),
7651 MemoryRequirement::HostVisible | MemoryRequirement::Cached));
7652 }
7653 catch (tcu::ResourceError &)
7654 {
7655 // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
7656 return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
7657 "Failed device memory allocation " + de::toString(sizes[OutputBallots]) + " bytes");
7658 }
7659 bufferDescriptors[OutputBallots] = makeDescriptorBufferInfo(**buffers[OutputBallots], 0, sizes[OutputBallots]);
7660 ptrs[OutputBallots] = buffers[OutputBallots]->getAllocation().getHostPtr();
7661
7662 vk::DescriptorSetUpdateBuilder setUpdateBuilder2;
7663 setUpdateBuilder2.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(OutputBallots),
7664 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptors[OutputBallots]);
7665 setUpdateBuilder2.update(vk, device);
7666 }
7667
7668 // Clear any writes to ballots/stores OutputB::b[] aka buffer[1] made during the counting pass.
7669 // Note that its size may have changed since the first memory allocation.
7670 deMemset(ptrs[OutputBallots], 0, (size_t)sizes[OutputBallots]);
7671 deMemset(ptrs[OutputCounts], 0, (size_t)sizes[OutputCounts]);
7672 deMemset(ptrs[OutputPrimitives], 0, (size_t)sizes[OutputPrimitives]);
7673
7674 // flush them all to the GPU
7675 flushAlloc(vk, device, buffers[OutputBallots]->getAllocation());
7676 flushAlloc(vk, device, buffers[OutputCounts]->getAllocation());
7677 flushAlloc(vk, device, buffers[OutputPrimitives]->getAllocation());
7678
7679 // run the actual shader with updated PushConstant
7680 pc.enableInvocationIndex = VK_TRUE;
7681 callRecordDrawingAndSubmit();
7682
7683 invalidateAlloc(vk, device, buffers[OutputCounts]->getAllocation());
7684 const uint32_t finalShaderMaxLoc = (*max_element(rangeLoc.first, rangeLoc.second));
7685 log << tcu::TestLog::Message << "final shaderMaxLoc: " << finalShaderMaxLoc << tcu::TestLog::EndMessage;
7686 if (finalShaderMaxLoc != shaderMaxLoc)
7687 {
7688 return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
7689 "maxLoc differs across shader invocations, expected: " + de::toString(shaderMaxLoc) +
7690 " got: " + de::toString(finalShaderMaxLoc));
7691 }
7692
7693 invalidateAlloc(vk, device, buffers[OutputBallots]->getAllocation());
7694 const tcu::UVec4 *ballots = static_cast<tcu::UVec4 *>(ptrs[OutputBallots]);
7695
7696 invalidateAlloc(vk, device, buffers[OutputPrimitives]->getAllocation());
7697 auto outputPrange = makeStdBeginEnd<uint32_t>(ptrs[OutputPrimitives], counts[OutputPrimitives]);
7698 std::copy(outputPrange.first, outputPrange.second, outputP.begin());
7699
7700 try
7701 {
7702 ref.resize(counts[OutputBallots], tcu::UVec4(0u, 0u, 0u, 0u));
7703 }
7704 catch (const std::bad_alloc &)
7705 {
7706 // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
7707 return tcu::TestStatus(QP_TEST_RESULT_NOT_SUPPORTED,
7708 "Failed system memory allocation " + de::toString(maxLoc * sizeof(uint64_t)) + " bytes");
7709 }
7710
7711 // Simulate execution on the CPU, and compare against the GPU result
7712 const uint32_t finalHostMaxLoc = program->execute(m_context.getTestContext().getWatchDog(), false, m_subgroupSize,
7713 fragmentStride, invocationStride, ref, log, outputP, ballots);
7714
7715 const qpTestResult res = calculateAndLogResultEx(log, ballots, ref, finalHostMaxLoc, PrintMode::None);
7716
7717 return tcu::TestStatus(res, qpGetTestResultName(res));
7718 }
7719
7720 qpTestResult_e ReconvergenceTestGeometryInstance::calculateAndLogResultEx(add_ref<tcu::TestLog> log,
7721 const tcu::UVec4 *result,
7722 const std::vector<tcu::UVec4> &ref,
7723 const uint32_t maxLoc,
7724 const PrintMode printMode)
7725 {
7726 DE_UNREF(maxLoc);
7727 DE_UNREF(printMode);
7728
7729 qpTestResult res = QP_TEST_RESULT_PASS;
7730 uint32_t mismatchCount = 0u;
7731 const uint32_t printMismatchCount = 5u;
7732
7733 // With maximal reconvergence, we should expect the output to exactly match the reference.
7734 const uint32_t ballotStoreCount = static_cast<uint32_t>(ref.size());
7735 for (uint32_t i = 0; i < ballotStoreCount; ++i)
7736 {
7737 const Ballot resultVal(result[i], m_subgroupSize);
7738 const Ballot refVal(ref.at(i), m_subgroupSize);
7739 if (resultVal != refVal)
7740 {
7741 if (mismatchCount++ < printMismatchCount)
7742 {
7743 res = QP_TEST_RESULT_FAIL;
7744 log << tcu::TestLog::Message << "Mismatch at " << i << "\nexpected: " << resultVal
7745 << "\n got: " << refVal << tcu::TestLog::EndMessage;
7746 if (printMode == PrintMode::Console)
7747 {
7748 std::cout << "Mismatch at " << i << "\nexpected: " << resultVal << "\n got: " << refVal
7749 << std::endl;
7750 }
7751 }
7752 }
7753 }
7754
7755 log << tcu::TestLog::Message << "Mismatch count: " << mismatchCount << " from " << ballotStoreCount
7756 << tcu::TestLog::EndMessage;
7757 if (printMode == PrintMode::Console)
7758 {
7759 std::cout << "Mismatch count: " << mismatchCount << " from " << ballotStoreCount << std::endl;
7760 }
7761
7762 return res;
7763 }
7764
7765 void createAmberFragmentTestCases(add_ref<tcu::TestContext> testCtx, add_ptr<tcu::TestCaseGroup> group);
7766
7767 tcu::TestCaseGroup *createTests(tcu::TestContext &testCtx, const std::string &name, bool createExperimental)
7768 {
7769 de::MovePtr<tcu::TestCaseGroup> group(new tcu::TestCaseGroup(testCtx, name.c_str(), "reconvergence tests"));
7770
7771 typedef struct
7772 {
7773 uint32_t value;
7774 const char *name;
7775 const char *description;
7776 } TestGroupCase;
7777
7778 TestGroupCase ttCases[] = {
7779 {TT_SUCF_ELECT, "subgroup_uniform_control_flow_elect", "subgroup_uniform_control_flow_elect"},
7780 {TT_SUCF_BALLOT, "subgroup_uniform_control_flow_ballot", "subgroup_uniform_control_flow_ballot"},
7781 {TT_WUCF_ELECT, "workgroup_uniform_control_flow_elect", "workgroup_uniform_control_flow_elect"},
7782 {TT_WUCF_BALLOT, "workgroup_uniform_control_flow_ballot", "workgroup_uniform_control_flow_ballot"},
7783 {TT_MAXIMAL, "maximal", "maximal"},
7784 };
7785
7786 std::pair<VkShaderStageFlagBits, const char *> const stTypes[]{
7787 {VK_SHADER_STAGE_COMPUTE_BIT, "compute"},
7788 {VK_SHADER_STAGE_FRAGMENT_BIT, "fragment"},
7789 #ifdef INCLUDE_GRAPHICS_TESTS
7790 {VK_SHADER_STAGE_VERTEX_BIT, "vertex"},
7791 {VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT, "tessctrl"},
7792 {VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT, "tesseval"},
7793 {VK_SHADER_STAGE_GEOMETRY_BIT, "geometry"},
7794 #endif
7795 };
7796
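 // Resulting test hierarchy: <test type> / <shader stage> / nesting<N> / <seed> / <index>.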
7797 for (int ttNdx = 0; ttNdx < DE_LENGTH_OF_ARRAY(ttCases); ttNdx++)
7798 {
7799 de::MovePtr<tcu::TestCaseGroup> ttGroup(
7800 new tcu::TestCaseGroup(testCtx, ttCases[ttNdx].name, ttCases[ttNdx].description));
7801
7802 for (int stNdx = 0; stNdx < DE_LENGTH_OF_ARRAY(stTypes); ++stNdx)
7803 {
7804 // Non-compute shader stages only run the 'maximal' reconvergence tests; skip the other test types for them.
7805 if (stTypes[stNdx].first != VK_SHADER_STAGE_COMPUTE_BIT && ttCases[ttNdx].value != TT_MAXIMAL)
7806 continue;
7807
7808 de::MovePtr<tcu::TestCaseGroup> shaderGroup(new tcu::TestCaseGroup(testCtx, stTypes[stNdx].second, ""));
7809
7810 uint32_t nNdx = 2;
7811
7812 if (stTypes[stNdx].first == VK_SHADER_STAGE_FRAGMENT_BIT)
7813 {
7814 nNdx = 7;
7815 createAmberFragmentTestCases(testCtx, shaderGroup.get());
7816 }
7817
7818 for (/*uint32_t nNdx = 2*/; nNdx <= 6; nNdx++)
7819 {
7820 de::MovePtr<tcu::TestCaseGroup> nestGroup(
7821 new tcu::TestCaseGroup(testCtx, ("nesting" + de::toString(nNdx)).c_str(), ""));
7822
7823 uint32_t seed = 0;
7824
7825 for (int sNdx = 0; sNdx < 8; sNdx++)
7826 {
7827 de::MovePtr<tcu::TestCaseGroup> seedGroup(
7828 new tcu::TestCaseGroup(testCtx, de::toString(sNdx).c_str(), ""));
7829
7830 uint32_t numTests = 0;
7831 switch (nNdx)
7832 {
7833 default:
7834 DE_ASSERT(0);
7835 // fallthrough
7836 case 2:
7837 case 3:
7838 case 4:
7839 numTests = 250;
7840 break;
7841 case 5:
7842 numTests = 100;
7843 break;
7844 case 6:
7845 numTests = 50;
7846 break;
7847 }
7848
7849 if (ttCases[ttNdx].value != TT_MAXIMAL)
7850 {
7851 if (nNdx >= 5)
7852 continue;
7853 }
7854
7855 for (uint32_t ndx = 0; ndx < numTests; ndx++)
7856 {
7857 uint32_t dim = 0u;
7858 DE_UNREF(dim);
7859 uint32_t sizeX = 0u;
7860 uint32_t sizeY = 0u;
7861 switch (stTypes[stNdx].first)
7862 {
7863 case VK_SHADER_STAGE_COMPUTE_BIT:
7864 // we want to test at least a full subgroup
7865 // both dimensions are prime numbers
7866 sizeX = 7u;
7867 sizeY = 13u;
7868 break;
7869 case VK_SHADER_STAGE_FRAGMENT_BIT:
7870 sizeX = 32;
7871 sizeY = 32;
7872 break;
7873 case VK_SHADER_STAGE_VERTEX_BIT:
7874 // we want to test at least a full subgroup
7875 dim = uint32_t(std::ceil(
7876 std::sqrt((double)(((128u + 31u) * 100u) / VertexRandomProgram::fillPercentage))));
7877 sizeX = dim;
7878 sizeY = dim;
7879 break;
7880 case VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT:
7881 sizeX = 19; // positive number of desired subgroups
7882 sizeY = 1; // used only for framebuffer extent in TCS test
7883 break;
7884 case VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT:
7885 sizeX = 23; // positive number of desired subgroups
7886 sizeY = 1; // used only for framebuffer extent in TES test
7887 break;
7888 case VK_SHADER_STAGE_GEOMETRY_BIT:
7889 // we want to test at least a full subgroup
7890 dim = uint32_t(std::ceil(
7891 std::sqrt((double)(((128u + 29u) * 100u) / GeometryRandomProgram::fillPercentage))));
7892 sizeX = dim;
7893 sizeY = dim;
7894 break;
7895 default:
7896 DE_ASSERT(0);
7897 }
7898 CaseDef c = {
7899 stTypes[stNdx].first, // VkShaderStageFlagBits shaderStage
7900 (TestType)ttCases[ttNdx].value, // TestType testType;
7901 nNdx, // uint32_t maxNesting;
7902 seed, // uint32_t seed;
7903 sizeX, // uint32_t sizeX;
7904 sizeY // uint32_t sizeY;
7905 };
7906 // product of sizeX and sizeY must not exceed MAX_INVOCATIONS_ALL_TESTS
7907 DE_ASSERT(c.verify());
7908 seed++;
7909
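 // Roughly the first fifth of the cases in each seed group goes into the default tree;
 // the remaining cases are only added when building the experimental group.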
7910 bool isExperimentalTest = (ndx >= numTests / 5);
7911
7912 if (createExperimental == isExperimentalTest)
7913 seedGroup->addChild(new ReconvergenceTestCase(testCtx, de::toString(ndx).c_str(), c));
7914 }
7915 if (!seedGroup->empty())
7916 nestGroup->addChild(seedGroup.release());
7917 }
7918 if (!nestGroup->empty())
7919 shaderGroup->addChild(nestGroup.release());
7920 }
7921 if (!shaderGroup->empty())
7922 ttGroup->addChild(shaderGroup.release());
7923 }
7924 group->addChild(ttGroup.release());
7925 }
7926
7927 return group.release();
7928 }
7929
7930 void createAmberFragmentTestCases(add_ref<tcu::TestContext> testCtx, add_ptr<tcu::TestCaseGroup> group)
7931 {
7932 using namespace cts_amber;
7933
7934 enum Tests
7935 {
7936 TERMINATE_INVOCATION,
7937 DEMOTE_INVOCATION,
7938 DEMOTE_ENTIRE_QUAD,
7939 DEMOTE_HALF_QUAD_TOP,
7940 DEMOTE_HALF_QUAD_RIGHT,
7941 DEMOTE_HALF_QUAD_BOTTOM,
7942 DEMOTE_HALF_QUAD_LEFT,
7943 DEMOTE_HALF_QUAD_SLASH,
7944 DEMOTE_HALF_QUAD_BACKSLASH
7945 };
7946
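 // Each case caches a hash of its name so the check-support callback below can match the
 // incoming test name against selected cases without repeated string comparisons.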
7947 struct Case
7948 {
7949 Tests test;
7950 add_cptr<char> name;
7951 add_cptr<char> desc;
7952 std::size_t hname;
7953 Case(Tests aTest, add_cptr<char> aName, add_cptr<char> aDesc)
7954 : test(aTest)
7955 , name(aName)
7956 , desc(aDesc)
7957 , hname(std::hash<std::string>()(std::string(aName)))
7958 {
7959 }
7960 bool matches(add_cref<std::string> aName) const
7961 {
7962 return hname == std::hash<std::string>()(aName);
7963 }
7964 static bool matches(add_cref<std::string> aName, std::initializer_list<Case> aList)
7965 {
7966 for (auto i = aList.begin(); i != aList.end(); ++i)
7967 {
7968 if (i->matches(aName))
7969 return true;
7970 }
7971 return false;
7972 }
7973 std::string makeFileName() const
7974 {
7975 return (std::string(name) + ".amber");
7976 }
7977 } static const cases[]{
7978 Case(TERMINATE_INVOCATION, "terminate_invocation",
7979 "Verifies that terminated invocation is no longer included in the ballot"),
7980 Case(DEMOTE_INVOCATION, "demote_invocation",
7981 "Verifies that the demoted invocation is not present in the ballot"),
7982 Case(DEMOTE_ENTIRE_QUAD, "demote_entire_quad", "Verifies that the demoted quad is not present in the ballot"),
7983 Case(DEMOTE_HALF_QUAD_TOP, "demote_half_quad_top",
7984 "Verifies that the demoted part of the quad is not present in the ballot"),
7985 Case(DEMOTE_HALF_QUAD_RIGHT, "demote_half_quad_right",
7986 "Verifies that the demoted part of the quad is not present in the ballot"),
7987 Case(DEMOTE_HALF_QUAD_BOTTOM, "demote_half_quad_bottom",
7988 "Verifies that the demoted part of the quad is not present in the ballot"),
7989 Case(DEMOTE_HALF_QUAD_LEFT, "demote_half_quad_left",
7990 "Verifies that the demoted part of the quad is not present in the ballot"),
7991 Case(DEMOTE_HALF_QUAD_SLASH, "demote_half_quad_slash",
7992 "Verifies that the demoted part of the quad is not present in the ballot"),
7993 Case(DEMOTE_HALF_QUAD_BACKSLASH, "demote_half_quad_backslash",
7994 "Verifies that the demoted part of the quad is not present in the ballot"),
7995 };
7996
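 // Common check-support callback shared by all Amber cases; per-case requirements
 // (terminate vs. demote, minimum subgroup size) are selected by the test name.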
7997 auto testSupports = [](Context &context, std::string testName) -> void
7998 {
7999 if (!(context.getSubgroupProperties().supportedStages & VK_SHADER_STAGE_FRAGMENT_BIT))
8000 TCU_THROW(NotSupportedError, "Subgroup operations not supported in fragment stage");
8001
8002 if (!context.getShaderMaximalReconvergenceFeatures().shaderMaximalReconvergence)
8003 TCU_THROW(NotSupportedError, "shaderMaximalReconvergence not supported");
8004
8005 if (!(context.getSubgroupProperties().supportedOperations & VK_SUBGROUP_FEATURE_BALLOT_BIT))
8006 TCU_THROW(NotSupportedError, "VK_SUBGROUP_FEATURE_BALLOT_BIT not supported");
8007
8008 if (Case::matches(testName, {cases[DEMOTE_ENTIRE_QUAD]}))
8009 {
8010 if (!(context.getSubgroupProperties().subgroupSize > 4))
8011 TCU_THROW(NotSupportedError, "subgroupSize is less than or equal to 4");
8012 }
8013 else
8014 {
8015 if (!(context.getSubgroupProperties().subgroupSize >= 4))
8016 TCU_THROW(NotSupportedError, "subgroupSize is less than 4");
8017 }
8018
8019 if (Case::matches(testName, {cases[TERMINATE_INVOCATION]}))
8020 {
8021 if (!context.getShaderTerminateInvocationFeatures().shaderTerminateInvocation)
8022 TCU_THROW(NotSupportedError, "shaderTerminateInvocation not supported.");
8023 }
8024 else
8025 {
8026 #ifndef CTS_USES_VULKANSC
8027 if (!context.getShaderDemoteToHelperInvocationFeatures().shaderDemoteToHelperInvocation)
8028 TCU_THROW(NotSupportedError, "demoteToHelperInvocation not supported.");
8029 #else
8030 if (!context.getShaderDemoteToHelperInvocationFeaturesEXT().shaderDemoteToHelperInvocation)
8031 TCU_THROW(NotSupportedError, "demoteToHelperInvocation not supported.");
8032 #endif
8033 }
8034 };
8035
8036 auto updateTest = [&](add_ptr<AmberTestCase> theTest) -> add_ptr<AmberTestCase>
8037 {
8038 theTest->setCheckSupportCallback(testSupports);
8039 return theTest;
8040 };
8041
8042 const std::string testsFolder(std::string("reconvergence/maximal/") + group->getName());
8043
8044 for (add_cref<Case> aCase : cases)
8045 {
8046 group->addChild(updateTest(
8047 createAmberTestCase(testCtx, aCase.name, aCase.desc, testsFolder.c_str(), aCase.makeFileName())));
8048 }
8049 }
8050
8051 } // namespace
8052
8053 tcu::TestCaseGroup *createTests(tcu::TestContext &testCtx, const std::string &name)
8054 {
8055 return createTests(testCtx, name, false);
8056 }
8057
8058 tcu::TestCaseGroup *createTestsExperimental(tcu::TestContext &testCtx, const std::string &name)
8059 {
8060 return createTests(testCtx, name, true);
8061 }
8062
8063 } // namespace Reconvergence
8064 } // namespace vkt
8065