/* Copyright 2019 Google LLC. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef RUY_RUY_TEST_H_
#define RUY_RUY_TEST_H_

#include <math.h>

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <iterator>
#include <limits>
#include <memory>
#include <numeric>
#include <random>
#include <set>
#include <sstream>
#include <string>
#include <tuple>
#include <type_traits>
#include <vector>

#include "ruy/allocator.h"
#include "ruy/context.h"
#include "ruy/context_get_ctx.h"
#include "ruy/ctx.h"
#include "ruy/gtest_wrapper.h"  // IWYU pragma: export
#include "ruy/matrix.h"         // IWYU pragma: export
#include "ruy/mul_params.h"     // IWYU pragma: export
#include "ruy/pack_common.h"
#include "ruy/platform.h"
#include "ruy/pmu.h"
#include "ruy/reference_mul.h"
#include "ruy/ruy.h"
#include "ruy/size_util.h"
#include "ruy/time.h"

#ifdef RUY_TEST_EXTERNAL_PATHS
#define EIGEN_USE_THREADS
#define EIGEN_USE_CUSTOM_THREAD_POOL
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#include "third_party/eigen3/Eigen/Core"
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#pragma GCC diagnostic pop
#include "third_party/gemmlowp/public/gemmlowp.h"
#include "third_party/lapack/blas.h"
#endif

#ifdef RUY_PROFILER
#include "ruy/profiler/profiler.h"
#endif

#ifdef __linux__
#include <sys/mman.h>
#include <unistd.h>
#endif

namespace ruy {

const float kClampRatio = 0.1f;

enum class ExternalPath {
  kNone,
  kReference,
  kGemmlowp,
  kEigen,
  kEigenTensor,
  kOpenBlas
};

inline std::vector<std::string>* CoveredPaths() {
  static std::vector<std::string> covered_paths;
  return &covered_paths;
}

inline const char* PathName(Path path) {
#define RUY_PATHNAME_CASE(NAME) \
  case Path::NAME:              \
    return #NAME;
  switch (path) {
    RUY_PATHNAME_CASE(kStandardCpp)
    RUY_PATHNAME_CASE(kInternalStandardCppVariant1)
    RUY_PATHNAME_CASE(kInternalStandardCppVariant2)
    RUY_PATHNAME_CASE(kInternalStandardCppVariant3)

#if RUY_PLATFORM_NEON
    RUY_PATHNAME_CASE(kNeon)
    RUY_PATHNAME_CASE(kNeonDotprod)
#elif RUY_PLATFORM_X86
    RUY_PATHNAME_CASE(kAvx2Fma)
    RUY_PATHNAME_CASE(kAvx512)
    RUY_PATHNAME_CASE(kAvx)
#endif
    default:
      RUY_CHECK(false);
      return nullptr;
  }
#undef RUY_PATHNAME_CASE
}

inline const char* TuningName(Tuning tuning) {
#define RUY_SUBPATHNAME_CASE(NAME) \
  case Tuning::NAME:               \
    return #NAME;
  switch (tuning) {
    RUY_SUBPATHNAME_CASE(kA55ish)
    RUY_SUBPATHNAME_CASE(kX1)
    RUY_SUBPATHNAME_CASE(kGeneric)
    default:
      RUY_CHECK(false);
      return nullptr;
  }
#undef RUY_SUBPATHNAME_CASE
}

inline const char* PathName(ExternalPath path) {
#define RUY_PATHNAME_CASE(NAME) \
  case ExternalPath::NAME:      \
    return #NAME;
  switch (path) {
    RUY_PATHNAME_CASE(kReference)
    RUY_PATHNAME_CASE(kGemmlowp)
    RUY_PATHNAME_CASE(kEigen)
    RUY_PATHNAME_CASE(kEigenTensor)
    RUY_PATHNAME_CASE(kOpenBlas)
    default:
      RUY_CHECK(false);
      return nullptr;
  }
#undef RUY_PATHNAME_CASE
}

inline std::ostream& operator<<(std::ostream& stream, Path path) {
  return stream << PathName(path);
}

inline std::ostream& operator<<(std::ostream& stream,
                                ExternalPath external_path) {
  return stream << PathName(external_path);
}

template <typename ContainerType>
std::string Join(const ContainerType& container) {
  if (container.empty()) {
    return "<empty>";
  }
  std::ostringstream stream;
  auto it = container.begin();
  stream << *it++;
  for (; it != container.end(); ++it) {
    stream << ", ";
    stream << *it;
  }
  return stream.str();
}
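
// For illustration: given a container holding {"kNeon", "kNeonDotprod"}, Join
// returns "kNeon, kNeonDotprod"; an empty container yields "<empty>".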

struct LogCoveredPathsOnDestruction final {
  ~LogCoveredPathsOnDestruction() {
    std::cerr << "Covered paths: " << Join(*CoveredPaths()) << std::endl;

    // When we know that it would be abnormal for some path not to be covered,
    // we check it here. Accidentally disabling SIMD paths has occurred in the
    // past and is one of the biggest performance regressions imaginable.
    //
    // TODO: we should be able to require some x86 paths as well, at least
    // SSE4.2.

#if RUY_PLATFORM_ARM
    // When testing on ARM32 or ARM64, make sure that we covered the NEON path.
    // NEON is always available on ARM64, and we treat it as always available
    // also on ARM32.
    bool found_neon = false;
    for (const std::string& covered_path : *CoveredPaths()) {
      if (covered_path == "kNeon") {
        found_neon = true;
      }
    }
    if (!found_neon) {
      std::cerr
          << "Error: we haven't tested the kNeon path as we should have.\n"
          << std::endl;
      abort();
    }
#endif

    // When testing on ARM64 ChromiumOS emulator, make sure that we covered
    // the dotprod path. We're getting such coverage at the moment thanks to
    // using a sufficiently recent emulator, and we don't want to regress that.
#if RUY_PLATFORM_ARM_64 && defined RUY_TESTING_ON_CHROMIUMOS
    bool found_dotprod = false;
    for (const std::string& covered_path : *CoveredPaths()) {
      if (covered_path == "kNeonDotprod") {
        found_dotprod = true;
      }
    }
    if (!found_dotprod) {
      std::cerr
          << "Error: we haven't tested the kNeonDotprod path as we should "
             "have. At the moment, this is required on ChromiumOS because the "
             "emulator that we run these tests in currently supports "
             "dot-product instructions, and we care very much about not "
             "regressing that. If this test was run in an emulator, please "
             "upgrade to a newer emulator version. If this test was run on an "
             "actual device, and you need to be able to run ruy tests on "
             "devices not supporting dot-product instructions, get in touch "
             "with us.\n"
          << std::endl;
      abort();
    }
#endif
  }
  static void Singleton() { static LogCoveredPathsOnDestruction singleton; }
};

enum class RandomRange {
  kGeneral,
  kAvoidMinValue,
  kOffCenterAvoidMinValue,
  kReasonableSrcZeroPoint,
  kReasonableDstZeroPoint,
  kBias
};

template <typename Scalar,
          bool IsFloatingPoint = std::is_floating_point<Scalar>::value>
struct RandomRangeBounds {};

template <typename Scalar>
struct RandomRangeBounds<Scalar, true> {
  static Scalar GetMinBound(RandomRange range) {
    switch (range) {
      case RandomRange::kGeneral:
        return -1;
      case RandomRange::kAvoidMinValue:
        return -1;
      case RandomRange::kOffCenterAvoidMinValue:
        return -1;
      case RandomRange::kReasonableSrcZeroPoint:
        return 0;
      case RandomRange::kReasonableDstZeroPoint:
        return 0;
      case RandomRange::kBias:
        return -1;
      default:
        RUY_CHECK(false);
        return 0;
    }
  }
  static Scalar GetMaxBound(RandomRange range) {
    switch (range) {
      case RandomRange::kGeneral:
        return 1;
      case RandomRange::kAvoidMinValue:
        return 1;
      case RandomRange::kOffCenterAvoidMinValue:
        return 1;
      case RandomRange::kReasonableSrcZeroPoint:
        return 0;
      case RandomRange::kReasonableDstZeroPoint:
        return 0;
      case RandomRange::kBias:
        return 1;
      default:
        RUY_CHECK(false);
        return 0;
    }
  }
};

template <typename Scalar>
Scalar WeightedSum(Scalar s1, float weight1, Scalar s2, float weight2) {
  float sum = s1 * weight1 + s2 * weight2;
  float clamped = std::min<float>(
      std::numeric_limits<Scalar>::max(),
      std::max<float>(std::numeric_limits<Scalar>::lowest(), sum));
  return static_cast<Scalar>(clamped);
}

template <typename Scalar>
Scalar Parametrized(float param) {
  return WeightedSum(std::numeric_limits<Scalar>::max(), param,
                     std::numeric_limits<Scalar>::lowest(), 1 - param);
}
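
// For illustration, with Scalar = std::int8_t, Parametrized<std::int8_t>(0.4)
// evaluates WeightedSum(127, 0.4f, -128, 0.6f): the weighted sum is
// 0.4 * 127 + 0.6 * (-128) = -26, already inside the int8 range, so the
// result is -26.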

template <typename Scalar>
struct RandomRangeBounds<Scalar, false> {
  static Scalar GetMinBound(RandomRange range) {
    static constexpr double offcenteredness =
        0.02;  // Shift lower limit by about 5 for range of 255.
    switch (range) {
      case RandomRange::kGeneral:
        return std::numeric_limits<Scalar>::lowest();
      case RandomRange::kAvoidMinValue:
        return 1 + std::numeric_limits<Scalar>::lowest();
      case RandomRange::kOffCenterAvoidMinValue:
        return 1 + std::numeric_limits<Scalar>::lowest() +
               static_cast<Scalar>(
                   offcenteredness * std::numeric_limits<Scalar>::max() -
                   offcenteredness *
                       (std::numeric_limits<Scalar>::lowest() + 1));
      case RandomRange::kReasonableSrcZeroPoint:
        return std::numeric_limits<Scalar>::lowest();
      case RandomRange::kReasonableDstZeroPoint:
        return Parametrized<Scalar>(0.4);
      case RandomRange::kBias:
        return std::is_same<Scalar, std::int32_t>::value
                   ? static_cast<Scalar>(-10000)
                   : 0;
      default:
        RUY_CHECK(false);
        return 0;
    }
  }
  static Scalar GetMaxBound(RandomRange range) {
    switch (range) {
      case RandomRange::kGeneral:
        return std::numeric_limits<Scalar>::max();
      case RandomRange::kAvoidMinValue:
        return std::numeric_limits<Scalar>::max();
      case RandomRange::kOffCenterAvoidMinValue:
        return std::numeric_limits<Scalar>::max();
      case RandomRange::kReasonableSrcZeroPoint:
        return std::numeric_limits<Scalar>::max();
      case RandomRange::kReasonableDstZeroPoint:
        return Parametrized<Scalar>(0.6);
      case RandomRange::kBias:
        return std::is_same<Scalar, std::int32_t>::value
                   ? static_cast<Scalar>(10000)
                   : 0;
      default:
        RUY_CHECK(false);
        return 0;
    }
  }
};
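
// For illustration, with Scalar = std::int8_t, kOffCenterAvoidMinValue yields
// a lower bound of 1 + (-128) + static_cast<int8_t>(0.02 * 127 - 0.02 * (-127))
// = -127 + 5 = -122 (the cast truncates 5.08 to 5), so the minimum value is
// avoided and the range is shifted up by about 5, as the comment above notes,
// while the upper bound stays at 127.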

inline std::default_random_engine& global_random_engine() {
  static std::default_random_engine engine;
  return engine;
}

template <typename Scalar>
struct UniformRandomDistribution {
  UniformRandomDistribution(RandomRange range)
      : dist(RandomRangeBounds<Scalar>::GetMinBound(range),
             RandomRangeBounds<Scalar>::GetMaxBound(range)) {}
  Scalar Get() { return dist(global_random_engine()); }
  // std::uniform_int_distribution is specified not to support char types,
  // only short and wider types. MSVC actually generates an error on
  // std::uniform_int_distribution<std::int8_t>.
  using StdDistType = typename std::conditional<
      std::is_floating_point<Scalar>::value,
      std::uniform_real_distribution<Scalar>,
      std::uniform_int_distribution<std::int32_t>>::type;
  StdDistType dist;
};

template <typename Scalar>
void MakeRandomScalar(UniformRandomDistribution<Scalar>* uniform_dist,
                      Scalar* dst) {
  *dst = uniform_dist->Get();
}

#if defined(__has_feature)
#if __has_feature(address_sanitizer)
#define RUY_TEST_BUILT_WITH_ASAN
#endif
#endif

// Don't use separate mappings when building with Address Sanitizer, as the
// manual mappings could hide actual address errors from ASan (ASan can't see
// the buffer's valid address range inside the manual mapping).
#if defined __linux__ && !defined RUY_TEST_BUILT_WITH_ASAN
#define RUY_TEST_USE_SEPARATE_MAPPINGS
#endif

template <typename T>
struct SeparateMappingAllocator {
  using value_type = T;

  T* allocate(std::size_t n) {
#ifdef RUY_TEST_USE_SEPARATE_MAPPINGS
    const std::size_t page_size = getpagesize();
    std::size_t buffer_size = n * sizeof(T);
    std::size_t rounded_buffer_size = round_up_pot(buffer_size, page_size);
    // We are going to map an additional page at the end of our buffer, then
    // unmap it, to ensure that our buffer's end is the last mapped byte, so as
    // to catch any overrun.
    void* mapping =
        mmap(nullptr, rounded_buffer_size + page_size, PROT_READ | PROT_WRITE,
             MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    RUY_CHECK_NE(mapping, MAP_FAILED);
    int unmap_result =
        munmap(static_cast<char*>(mapping) + rounded_buffer_size, page_size);
    RUY_CHECK_EQ(unmap_result, 0);
    // Clearing bytes should be redundant since mmap has to do it already, but
    // it does not hurt and acts as an assertion checking that we got the above
    // mapping and unmapping right.
    std::memset(mapping, 0, rounded_buffer_size);
    // Compute the offset to make our buffer end at the last mapped byte.
    std::size_t buffer_offset = rounded_buffer_size - buffer_size;
    void* buffer =
        static_cast<void*>(static_cast<char*>(mapping) + buffer_offset);
    return static_cast<T*>(buffer);
#else
    T* ret = new T[n];
    std::memset(ret, 0, n * sizeof(T));
    return ret;
#endif
  }
  void deallocate(T* p, std::size_t n) {
#ifdef RUY_TEST_USE_SEPARATE_MAPPINGS
    // The mapped bytes are the buffer address range, rounded on both ends
    // to page boundary.
    const std::size_t page_size = getpagesize();
    std::size_t buffer_size = n * sizeof(T);
    std::size_t rounded_buffer_size = round_up_pot(buffer_size, page_size);
    std::uintptr_t p_addr = reinterpret_cast<std::uintptr_t>(p);
    void* mapping = reinterpret_cast<void*>(p_addr & ~(page_size - 1));
    int ret = munmap(mapping, rounded_buffer_size);
    RUY_CHECK_EQ(ret, 0);
#else
    (void)n;
    delete[] p;
#endif
  }
};

template <typename T>
using SeparateMappingVector = std::vector<T, SeparateMappingAllocator<T>>;
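
// For illustration of the mapping arithmetic above (assuming a 4096-byte
// page): SeparateMappingVector<float> v; v.resize(1000); requests
// buffer_size = 4000 bytes, rounded_buffer_size = 4096, so the buffer is
// placed at offset 96 inside the mapping and its last byte is the last
// mapped byte; any access past the end faults immediately.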

template <typename Scalar, typename Allocator>
void MakeRandomVector(UniformRandomDistribution<Scalar>* uniform_dist, int size,
                      std::vector<Scalar, Allocator>* dst) {
  dst->resize(size);
  for (auto& x : *dst) {
    MakeRandomScalar(uniform_dist, &x);
  }
}

template <typename Scalar>
void MakeRandomScalar(RandomRange range, Scalar* dst) {
  UniformRandomDistribution<Scalar> dist(range);
  *dst = dist.Get();
  if (range == RandomRange::kReasonableDstZeroPoint ||
      range == RandomRange::kReasonableSrcZeroPoint) {
    if (global_random_engine()() & 1) {
      *dst = SymmetricZeroPoint<Scalar>();
    }
  }
}

template <typename Scalar, typename Allocator>
void MakeRandomVector(RandomRange range, int size,
                      std::vector<Scalar, Allocator>* dst) {
  UniformRandomDistribution<Scalar> dist(range);
  dst->resize(size);
  for (auto& x : *dst) {
    MakeRandomScalar(&dist, &x);
  }
}

enum class LayoutStyle { kUnstridedLinear, kLinear };

inline void MakeLayout(int rows, int cols, Order order,
                       LayoutStyle layout_style, Layout* layout) {
  layout->set_rows(rows);
  layout->set_cols(cols);
  layout->set_order(order);

  const int min_stride = order == Order::kColMajor ? rows : cols;

  RUY_CHECK(layout_style == LayoutStyle::kUnstridedLinear ||
            layout_style == LayoutStyle::kLinear);
  if (layout_style == LayoutStyle::kUnstridedLinear) {
    layout->set_stride(min_stride);
  } else {
    layout->set_stride(min_stride + 1);
  }
}
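
// For illustration: MakeLayout(3, 4, Order::kColMajor, LayoutStyle::kLinear,
// &layout) produces a 3x4 column-major layout with stride 4, i.e. one padding
// element per column, whereas kUnstridedLinear would give the minimal stride 3.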

template <typename Scalar>
struct StorageMatrix {
  StorageMatrix() = default;
  StorageMatrix(const StorageMatrix&) = delete;
  SeparateMappingVector<Scalar> data;
  Matrix<Scalar> matrix;
};

inline bool IsUnstrided(const Layout& layout) {
  if (layout.order() == Order::kColMajor) {
    return layout.stride() == layout.rows();
  } else {
    return layout.stride() == layout.cols();
  }
}

inline bool IsRowMajor(const Layout& layout) {
  return layout.order() == Order::kRowMajor;
}

inline bool IsColMajor(const Layout& layout) {
  return layout.order() == Order::kColMajor;
}

inline int FlatSize(const Layout& layout) {
  const int outerdim =
      layout.order() == Order::kColMajor ? layout.cols() : layout.rows();
  return layout.stride() * outerdim;
}
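
// For illustration, continuing the 3x4 column-major example above: with
// stride 4 the layout occupies FlatSize = 4 * 4 = 16 elements of storage,
// while the unstrided variant needs only 3 * 4 = 12.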

template <typename Scalar>
void VerifyConsistentFields(const StorageMatrix<Scalar>& storage_matrix) {
  if (storage_matrix.data.empty()) {
    RUY_CHECK_EQ(storage_matrix.matrix.data(), nullptr);
    RUY_CHECK_EQ(storage_matrix.matrix.layout().rows(), 0);
    RUY_CHECK_EQ(storage_matrix.matrix.layout().cols(), 0);
  } else {
    RUY_CHECK_EQ(storage_matrix.matrix.data(), storage_matrix.data.data());
    RUY_CHECK_EQ(FlatSize(storage_matrix.matrix.layout()),
                 static_cast<int>(storage_matrix.data.size()));
  }
}

template <typename Scalar>
void MakeRandom(int rows, int cols, Order order, Scalar zero_point,
                LayoutStyle layout_style, RandomRange range,
                StorageMatrix<Scalar>* storage_matrix) {
  MakeLayout(rows, cols, order, layout_style,
             storage_matrix->matrix.mutable_layout());
  storage_matrix->matrix.set_zero_point(zero_point);
  UniformRandomDistribution<Scalar> data_dist(range);
  MakeRandomVector(&data_dist, FlatSize(storage_matrix->matrix.layout()),
                   &storage_matrix->data);
  storage_matrix->matrix.set_data(storage_matrix->data.data());
  VerifyConsistentFields(*storage_matrix);
}

template <typename Scalar>
struct TestResult {
  void operator=(const TestResult&) = delete;
  void operator=(const TestResult&&) = delete;
  StorageMatrix<Scalar> storage_matrix;
  Path path = Path::kNone;
  Tuning tuning = Tuning::kAuto;
  ExternalPath external_path = ExternalPath::kNone;
  float latency;
  float l1_refill_rate;
  float l2_refill_rate;
  float l3_refill_rate;
  float l1tlb_refill_rate;
  float l2tlb_refill_rate;
  float mispred_rate;
  float frontend_stall_rate;
  float backend_stall_rate;
};

template <typename Scalar>
std::string PathName(const TestResult<Scalar>& result) {
  std::string pathname;
  if (result.path != Path::kNone) {
    pathname.assign(PathName(result.path));
  } else if (result.external_path != ExternalPath::kNone) {
    pathname.assign(PathName(result.external_path));
  } else {
    RUY_CHECK(false);
  }
  if (result.tuning != Tuning::kAuto) {
    pathname.append("/");
    pathname.append(TuningName(result.tuning));
  }
  return pathname;
}
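
// For illustration: a result that ran Path::kNeon with Tuning::kA55ish is
// reported as "kNeon/kA55ish", while one that ran an external library, say
// ExternalPath::kEigen, is reported simply as "kEigen".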

template <typename tLhsScalar, typename tRhsScalar, typename tAccumScalar,
          typename tDstScalar>
struct TestSet final {
  using LhsScalar = tLhsScalar;
  using RhsScalar = tRhsScalar;
  using AccumScalar = tAccumScalar;
  using DstScalar = tDstScalar;
  using MulParamsType = MulParams<AccumScalar, DstScalar>;
  using TestResultType = TestResult<DstScalar>;

  void Run() {
    MakeZeroPoints();
    MakeLhsRhs();
    MakeMulParams();
    MakeOtherParams();
    MakeResultPaths();
    Eval();
    Verify();
  }

 private:
  void MakeZeroPoints();
  void MakeLhsRhs();
  void MakeMulParams();
  void MakeResultPaths();
  void MakeOtherParams();
  void EvalAndVerify();
  void Eval();
  void Verify();

  void EvalResult(TestResultType* result);
  void EvalRuy(TestResultType* result);
  void DoMul(TestResultType* result);
  void Benchmark(TestResultType* result);
  void VerifyTestResults() const;

 public:
  enum class LifeStage {
    kInitial,
    kHasZeroPoints,
    kHasLhsRhs,
    kHasMulParams,
    kHasOtherParams,
    kHasResultPaths,
    kEvaluated,
    kFinal
  };

  ~TestSet();

  LifeStage life_stage = LifeStage::kInitial;

  int rows = 0;
  int cols = 0;
  int depth = 0;
  Order lhs_order = Order::kRowMajor;
  Order rhs_order = Order::kColMajor;
  Order dst_order = Order::kColMajor;
  LayoutStyle layout_style = LayoutStyle::kUnstridedLinear;

  bool use_specified_zero_points = false;
  LhsScalar lhs_zero_point = 0;
  RhsScalar rhs_zero_point = 0;
  DstScalar dst_zero_point = 0;

  SeparateMappingVector<AccumScalar> per_channel_multiplier_fixedpoint;
  SeparateMappingVector<int> per_channel_multiplier_exponent;

  StorageMatrix<LhsScalar> lhs;
  StorageMatrix<RhsScalar> rhs;
  MulParamsType mul_params;
  SeparateMappingVector<AccumScalar> bias_data;
  std::vector<std::unique_ptr<TestResultType>> results;

  std::vector<Path> paths;
  std::vector<ExternalPath> external_paths;

  bool benchmark = false;
  bool perchannel = false;
  int max_num_threads = 0;

  bool cache_lhs = false;
  bool cache_rhs = false;
};

inline PmuEvents& GlobalPmuEvents() {
  static PmuEvents pmu;
  return pmu;
}

inline Context& GlobalContext() {
  // Ensure that GlobalPmuEvents is constructed before we create any context.
  // This ensures that pmu counters are opened before we create any worker
  // thread, which is necessary to count events from worker threads.
  GlobalPmuEvents();

  static Context context;
  return context;
}

template <typename LhsScalar, typename RhsScalar, typename AccumScalar,
          typename DstScalar>
TestSet<LhsScalar, RhsScalar, AccumScalar, DstScalar>::~TestSet() {
  RUY_CHECK_EQ(life_stage, LifeStage::kFinal);
  LogCoveredPathsOnDestruction::Singleton();
  GlobalContext().ClearPrepackedCache();
}

#if defined(__has_feature)
#if __has_feature(thread_sanitizer)
#define RUY_TSAN
#endif
#if __has_feature(address_sanitizer)
#define RUY_ASAN
#endif
#endif  // defined(__has_feature)

template <typename LhsScalar, typename RhsScalar, typename AccumScalar,
          typename DstScalar>
void TestSet<LhsScalar, RhsScalar, AccumScalar, DstScalar>::DoMul(
    TestResultType* result) {
  Mul<kAllPathsIncludingInternalVariants>(lhs.matrix, rhs.matrix, mul_params,
                                          &GlobalContext(),
                                          &result->storage_matrix.matrix);
}

template <typename LhsScalar, typename RhsScalar, typename AccumScalar,
          typename DstScalar>
void TestSet<LhsScalar, RhsScalar, AccumScalar, DstScalar>::EvalRuy(
    TestResultType* result) {
  GlobalContext().set_explicit_tuning(result->tuning);
  if (max_num_threads) {
    GlobalContext().set_max_num_threads(max_num_threads);
  } else if (benchmark) {
    GlobalContext().set_max_num_threads(1);
  } else {
    GlobalContext().set_max_num_threads(1 + global_random_engine()() % 8);
  }
  get_ctx(&GlobalContext())->SetRuntimeEnabledPaths(result->path);
  DoMul(result);
  // When caching is enabled, Mul is stateful, so we run it a second time to
  // get coverage of that aspect.
  if (!benchmark && (cache_lhs || cache_rhs)) {
    DoMul(result);
  }
  RUY_CHECK_EQ(GlobalContext().last_used_path(), result->path);
  GlobalContext().set_explicit_tuning(Tuning::kAuto);
  GlobalContext().set_max_num_threads(1);
}

#ifdef RUY_TEST_EXTERNAL_PATHS

template <typename Scalar, gemmlowp::MapOrder tOrder>
void WrapGemmlowp(const Matrix<Scalar>& src,
                  gemmlowp::MatrixMap<const Scalar, tOrder>* dst) {
  RUY_CHECK(src.layout().order() == (tOrder == gemmlowp::MapOrder::ColMajor
                                         ? Order::kColMajor
                                         : Order::kRowMajor));
  *dst = gemmlowp::MatrixMap<const Scalar, tOrder>(
      src.data(), src.layout().rows(), src.layout().cols(),
      src.layout().stride());
}

template <typename Scalar, gemmlowp::MapOrder tOrder>
void WrapGemmlowpMutable(Matrix<Scalar>* src,
                         gemmlowp::MatrixMap<Scalar, tOrder>* dst) {
  RUY_CHECK(src->layout().order() == (tOrder == gemmlowp::MapOrder::ColMajor
                                          ? Order::kColMajor
                                          : Order::kRowMajor));
  *dst = gemmlowp::MatrixMap<Scalar, tOrder>(src->data(), src->layout().rows(),
                                             src->layout().cols(),
                                             src->layout().stride());
}

template <Order tOrder>
struct GemmlowpOrder {};

template <>
struct GemmlowpOrder<Order::kColMajor> {
  static constexpr gemmlowp::MapOrder kValue = gemmlowp::MapOrder::ColMajor;
};

template <>
struct GemmlowpOrder<Order::kRowMajor> {
  static constexpr gemmlowp::MapOrder kValue = gemmlowp::MapOrder::RowMajor;
};

inline gemmlowp::GemmContext& GlobalGemmlowpContext() {
  static gemmlowp::GemmContext context;
  return context;
}

template <Order LhsOrder, Order RhsOrder, Order DstOrder, typename LhsScalar,
          typename RhsScalar, typename DstScalar, typename MulParamsType>
void EvalGemmlowp(const Matrix<LhsScalar>& lhs, const Matrix<RhsScalar>& rhs,
                  const MulParamsType& mul_params, int max_num_threads,
                  Matrix<DstScalar>* dst) {
  static constexpr gemmlowp::MapOrder kGemmlowpLhsOrder =
      GemmlowpOrder<LhsOrder>::kValue;
  static constexpr gemmlowp::MapOrder kGemmlowpRhsOrder =
      GemmlowpOrder<RhsOrder>::kValue;
  static constexpr gemmlowp::MapOrder kGemmlowpDstOrder =
      GemmlowpOrder<DstOrder>::kValue;
  gemmlowp::MatrixMap<const LhsScalar, kGemmlowpLhsOrder> gemmlowp_lhs;
  gemmlowp::MatrixMap<const RhsScalar, kGemmlowpRhsOrder> gemmlowp_rhs;
  gemmlowp::MatrixMap<DstScalar, kGemmlowpDstOrder> gemmlowp_dst;
  WrapGemmlowp(lhs, &gemmlowp_lhs);
  WrapGemmlowp(rhs, &gemmlowp_rhs);
  WrapGemmlowpMutable(dst, &gemmlowp_dst);

  gemmlowp::OutputStageScaleInt32ByFixedPointAndExponent quantize_down_stage;
  quantize_down_stage.result_offset_after_shift = dst->zero_point();
  quantize_down_stage.result_fixedpoint_multiplier =
      mul_params.multiplier_fixedpoint();
  quantize_down_stage.result_exponent = mul_params.multiplier_exponent();
  gemmlowp::OutputStageScaleInt32ByFixedPointAndExponentPC<
      gemmlowp::VectorShape::Col>
      quantize_down_stage_pc;
  quantize_down_stage_pc.result_offset_after_shift = dst->zero_point();
  using ColVectorMap =
      gemmlowp::VectorMap<const std::int32_t, gemmlowp::VectorShape::Col>;
  quantize_down_stage_pc.result_fixedpoint_multiplier = ColVectorMap(
      mul_params.multiplier_fixedpoint_perchannel(), lhs.layout().rows());
  quantize_down_stage_pc.result_exponent = ColVectorMap(
      mul_params.multiplier_exponent_perchannel(), lhs.layout().rows());

  gemmlowp::OutputStageClamp clamp_stage;
  clamp_stage.min = mul_params.clamp_min();
  clamp_stage.max = mul_params.clamp_max();
  using OutputStageSaturatingCast = typename std::conditional<
      std::is_same<DstScalar, std::uint8_t>::value,
      gemmlowp::OutputStageSaturatingCastToUint8,
      gemmlowp::OutputStageSaturatingCastToInt16>::type;
  OutputStageSaturatingCast saturating_cast_stage;

  GlobalGemmlowpContext().set_max_num_threads(max_num_threads ? max_num_threads
                                                              : 1);
  if (mul_params.bias()) {
    using ColVectorMap =
        gemmlowp::VectorMap<const std::int32_t, gemmlowp::VectorShape::Col>;
    gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_add_stage;
    bias_add_stage.bias_vector =
        ColVectorMap(mul_params.bias(), dst->layout().rows());
#ifndef GEMMLOWP_SSE4  // gemmlowp perchannel stuff does not build on SSE
    if (mul_params.multiplier_exponent_perchannel()) {
      const auto& output_pipeline =
          std::make_tuple(bias_add_stage, quantize_down_stage_pc, clamp_stage,
                          saturating_cast_stage);
      gemmlowp::GemmWithOutputPipeline<
          LhsScalar, DstScalar, gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
          &GlobalGemmlowpContext(), gemmlowp_lhs, gemmlowp_rhs, &gemmlowp_dst,
          -lhs.zero_point(), -rhs.zero_point(), output_pipeline);
    } else  // NOLINT[readability/braces]
#endif
    {
      const auto& output_pipeline =
          std::make_tuple(bias_add_stage, quantize_down_stage, clamp_stage,
                          saturating_cast_stage);
      gemmlowp::GemmWithOutputPipeline<
          LhsScalar, DstScalar, gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
          &GlobalGemmlowpContext(), gemmlowp_lhs, gemmlowp_rhs, &gemmlowp_dst,
          -lhs.zero_point(), -rhs.zero_point(), output_pipeline);
    }
  } else {
#ifndef GEMMLOWP_SSE4  // gemmlowp perchannel stuff does not build on SSE
    if (mul_params.multiplier_exponent_perchannel()) {
      const auto& output_pipeline = std::make_tuple(
          quantize_down_stage_pc, clamp_stage, saturating_cast_stage);
      gemmlowp::GemmWithOutputPipeline<
          LhsScalar, DstScalar, gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
          &GlobalGemmlowpContext(), gemmlowp_lhs, gemmlowp_rhs, &gemmlowp_dst,
          -lhs.zero_point(), -rhs.zero_point(), output_pipeline);
    } else  // NOLINT[readability/braces]
#endif
    {
      const auto& output_pipeline = std::make_tuple(
          quantize_down_stage, clamp_stage, saturating_cast_stage);
      gemmlowp::GemmWithOutputPipeline<
          LhsScalar, DstScalar, gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
          &GlobalGemmlowpContext(), gemmlowp_lhs, gemmlowp_rhs, &gemmlowp_dst,
          -lhs.zero_point(), -rhs.zero_point(), output_pipeline);
    }
  }
}

inline constexpr int Mash(Order LhsOrder, Order RhsOrder, Order DstOrder) {
  return (LhsOrder == Order::kRowMajor ? 4 : 0) +
         (RhsOrder == Order::kRowMajor ? 2 : 0) +
         (DstOrder == Order::kRowMajor ? 1 : 0);
}
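
// For illustration: Mash packs the three storage orders into a 3-bit index,
// e.g. Mash(Order::kRowMajor, Order::kColMajor, Order::kColMajor) == 4 and
// Mash(Order::kColMajor, Order::kColMajor, Order::kRowMajor) == 1, which the
// switch statements below use to dispatch to the right template instantiation.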

template <typename LhsScalar, typename RhsScalar, typename DstScalar,
          typename MulParamsType>
void EvalGemmlowp(const Matrix<LhsScalar>& lhs, const Matrix<RhsScalar>& rhs,
                  const MulParamsType& mul_params, int max_num_threads,
                  Matrix<DstScalar>* dst) {
  int index =
      Mash(lhs.layout().order(), rhs.layout().order(), dst->layout().order());
  switch (index) {
#define EVALGEMMLOWP_CASE3(LHS, RHS, DST)                                     \
  case Mash(LHS, RHS, DST):                                                   \
    return EvalGemmlowp<LHS, RHS, DST>(lhs, rhs, mul_params, max_num_threads, \
                                       dst);
#define EVALGEMMLOWP_CASE2(LHS, RHS)             \
  EVALGEMMLOWP_CASE3(LHS, RHS, Order::kColMajor) \
  EVALGEMMLOWP_CASE3(LHS, RHS, Order::kRowMajor)
#define EVALGEMMLOWP_CASE1(LHS)             \
  EVALGEMMLOWP_CASE2(LHS, Order::kColMajor) \
  EVALGEMMLOWP_CASE2(LHS, Order::kRowMajor)

    EVALGEMMLOWP_CASE1(Order::kColMajor)
    EVALGEMMLOWP_CASE1(Order::kRowMajor)

#undef EVALGEMMLOWP_CASE1
#undef EVALGEMMLOWP_CASE2
#undef EVALGEMMLOWP_CASE3

    default:
      RUY_CHECK(false);
  }
}

template <Order tOrder>
struct EigenOrder {};

template <>
struct EigenOrder<Order::kColMajor> {
  static constexpr int kValue = Eigen::ColMajor;
};

template <>
struct EigenOrder<Order::kRowMajor> {
  static constexpr int kValue = Eigen::RowMajor;
};

template <Order LhsOrder, Order RhsOrder, Order DstOrder, typename LhsScalar,
          typename RhsScalar, typename DstScalar, typename MulParamsType>
void EvalEigen(const Matrix<LhsScalar>& lhs, const Matrix<RhsScalar>& rhs,
               const MulParamsType& mul_params, int max_num_threads,
               Matrix<DstScalar>* dst) {
  RUY_CHECK_EQ(lhs.zero_point(), 0);
  RUY_CHECK_EQ(rhs.zero_point(), 0);
  RUY_CHECK_EQ(dst->zero_point(), 0);
  RUY_CHECK_EQ(mul_params.multiplier_fixedpoint(), 0);
  RUY_CHECK_EQ(mul_params.multiplier_exponent(), 0);

  static constexpr int kEigenLhsOrder = EigenOrder<LhsOrder>::kValue;
  static constexpr int kEigenRhsOrder = EigenOrder<RhsOrder>::kValue;
  static constexpr int kEigenDstOrder = EigenOrder<DstOrder>::kValue;

  using EigenLhsType = typename Eigen::Matrix<LhsScalar, Eigen::Dynamic,
                                              Eigen::Dynamic, kEigenLhsOrder>::
      template StridedConstMapType<Eigen::OuterStride<Eigen::Dynamic>>::type;
  using EigenRhsType = typename Eigen::Matrix<RhsScalar, Eigen::Dynamic,
                                              Eigen::Dynamic, kEigenRhsOrder>::
      template StridedConstMapType<Eigen::OuterStride<Eigen::Dynamic>>::type;
  using EigenDstType = typename Eigen::Matrix<DstScalar, Eigen::Dynamic,
                                              Eigen::Dynamic, kEigenDstOrder>::
      template StridedMapType<Eigen::OuterStride<Eigen::Dynamic>>::type;
  using EigenBiasType =
      typename Eigen::Matrix<DstScalar, Eigen::Dynamic, 1>::ConstMapType;

  EigenLhsType eigen_lhs(
      lhs.data(), lhs.layout().rows(), lhs.layout().cols(),
      Eigen::OuterStride<Eigen::Dynamic>(lhs.layout().stride()));
  EigenRhsType eigen_rhs(
      rhs.data(), rhs.layout().rows(), rhs.layout().cols(),
      Eigen::OuterStride<Eigen::Dynamic>(rhs.layout().stride()));
  EigenDstType eigen_dst(
      dst->data(), dst->layout().rows(), dst->layout().cols(),
      Eigen::OuterStride<Eigen::Dynamic>(dst->layout().stride()));
  Eigen::setNbThreads(max_num_threads ? max_num_threads : 1);

  if (mul_params.bias()) {
    EigenBiasType eigen_bias(mul_params.bias(), dst->layout().rows());
    if (mul_params.clamp_max() == std::numeric_limits<DstScalar>::infinity() &&
        mul_params.clamp_min() == -std::numeric_limits<DstScalar>::infinity()) {
      eigen_dst.noalias() = (eigen_lhs * eigen_rhs).colwise() + eigen_bias;
    } else {
      eigen_dst.noalias() = ((eigen_lhs * eigen_rhs).colwise() + eigen_bias)
                                .cwiseMin(mul_params.clamp_max())
                                .cwiseMax(mul_params.clamp_min());
    }
  } else {
    if (mul_params.clamp_max() == std::numeric_limits<DstScalar>::infinity() &&
        mul_params.clamp_min() == -std::numeric_limits<DstScalar>::infinity()) {
      eigen_dst.noalias() = eigen_lhs * eigen_rhs;
    } else {
      eigen_dst.noalias() = (eigen_lhs * eigen_rhs)
                                .cwiseMin(mul_params.clamp_max())
                                .cwiseMax(mul_params.clamp_min());
    }
  }
}

template <typename LhsScalar, typename RhsScalar, typename DstScalar,
          typename MulParamsType>
void EvalEigen(const Matrix<LhsScalar>& lhs, const Matrix<RhsScalar>& rhs,
               const MulParamsType& mul_params, int max_num_threads,
               Matrix<DstScalar>* dst) {
  int index =
      Mash(lhs.layout().order(), rhs.layout().order(), dst->layout().order());
  switch (index) {
#define EVALEIGEN_CASE3(LHS, RHS, DST) \
  case Mash(LHS, RHS, DST):            \
    return EvalEigen<LHS, RHS, DST>(lhs, rhs, mul_params, max_num_threads, dst);
#define EVALEIGEN_CASE2(LHS, RHS)             \
  EVALEIGEN_CASE3(LHS, RHS, Order::kColMajor) \
  EVALEIGEN_CASE3(LHS, RHS, Order::kRowMajor)
#define EVALEIGEN_CASE1(LHS)             \
  EVALEIGEN_CASE2(LHS, Order::kColMajor) \
  EVALEIGEN_CASE2(LHS, Order::kRowMajor)

    EVALEIGEN_CASE1(Order::kColMajor)
    EVALEIGEN_CASE1(Order::kRowMajor)

#undef EVALEIGEN_CASE1
#undef EVALEIGEN_CASE2
#undef EVALEIGEN_CASE3

    default:
      RUY_CHECK(false);
  }
}

template <Order LhsOrder, Order RhsOrder, Order DstOrder, typename Scalar,
          typename MulParamsType>
void EvalEigenTensor(const Matrix<Scalar>& lhs, const Matrix<Scalar>& rhs,
                     const MulParamsType& mul_params, int max_num_threads,
                     Matrix<Scalar>* dst) {
  RUY_CHECK_EQ(lhs.zero_point(), 0);
  RUY_CHECK_EQ(rhs.zero_point(), 0);
  RUY_CHECK_EQ(dst->zero_point(), 0);
  RUY_CHECK_EQ(mul_params.multiplier_fixedpoint(), 0);
  RUY_CHECK_EQ(mul_params.multiplier_exponent(), 0);

  // Eigen::TensorMap only supports unstrided layouts.
  RUY_CHECK(IsUnstrided(lhs.layout()));
  RUY_CHECK(IsUnstrided(rhs.layout()));
  RUY_CHECK(IsUnstrided(dst->layout()));

  using TensorLhsType =
      Eigen::TensorMap<Eigen::Tensor<const Scalar, 2, Eigen::ColMajor>>;
  using TensorRhsType =
      Eigen::TensorMap<Eigen::Tensor<const Scalar, 2, Eigen::ColMajor>>;
  using TensorDstType =
      Eigen::TensorMap<Eigen::Tensor<Scalar, 2, Eigen::ColMajor>>;
  using TensorBiasType =
      Eigen::TensorMap<Eigen::Tensor<const Scalar, 1, Eigen::ColMajor>>;

  const bool tr = DstOrder == Order::kRowMajor;
  const auto& contract_lhs = tr ? rhs : lhs;
  const auto& contract_rhs = tr ? lhs : rhs;

  TensorLhsType tensor_lhs(
      contract_lhs.data(),
      LhsOrder == Order::kColMajor ? contract_lhs.layout().rows()
                                   : contract_lhs.layout().cols(),
      LhsOrder == Order::kColMajor ? contract_lhs.layout().cols()
                                   : contract_lhs.layout().rows());
  TensorRhsType tensor_rhs(
      contract_rhs.data(),
      RhsOrder == Order::kColMajor ? contract_rhs.layout().rows()
                                   : contract_rhs.layout().cols(),
      RhsOrder == Order::kColMajor ? contract_rhs.layout().cols()
                                   : contract_rhs.layout().rows());
  TensorDstType tensor_dst(dst->data(),
                           DstOrder == Order::kColMajor ? dst->layout().rows()
                                                        : dst->layout().cols(),
                           DstOrder == Order::kColMajor ? dst->layout().cols()
                                                        : dst->layout().rows());
  using DimPair =
      typename Eigen::Tensor<Scalar, 1, 0, Eigen::Index>::DimensionPair;
  Eigen::array<DimPair, 1> contract_dims{
      {DimPair((LhsOrder == Order::kColMajor) ? 1 : 0,
               (RhsOrder == Order::kColMajor) ? 0 : 1)}};
  static Eigen::ThreadPool pool(max_num_threads ? max_num_threads : 1);
  static Eigen::ThreadPoolDevice device(&pool, pool.NumThreads());
  if (mul_params.bias()) {
    TensorBiasType tensor_bias(mul_params.bias(), dst->layout().rows());
    Eigen::array<int, 2> bias_2d_shape{tr ? 1 : dst->layout().rows(),
                                       tr ? dst->layout().rows() : 1};
    Eigen::array<int, 2> bcast{tr ? dst->layout().cols() : 1,
                               tr ? 1 : dst->layout().cols()};
    if (mul_params.clamp_max() == std::numeric_limits<Scalar>::infinity() &&
        mul_params.clamp_min() == -std::numeric_limits<Scalar>::infinity()) {
      tensor_dst.device(device) =
          tensor_lhs.contract(tensor_rhs, contract_dims) +
          tensor_bias.reshape(bias_2d_shape).broadcast(bcast);
    } else {
      tensor_dst.device(device) =
          (tensor_lhs.contract(tensor_rhs, contract_dims) +
           tensor_bias.reshape(bias_2d_shape).broadcast(bcast))
              .cwiseMin(mul_params.clamp_max())
              .cwiseMax(mul_params.clamp_min());
    }
  } else {
    if (mul_params.clamp_max() == std::numeric_limits<Scalar>::infinity() &&
        mul_params.clamp_min() == -std::numeric_limits<Scalar>::infinity()) {
      tensor_dst.device(device) =
          tensor_lhs.contract(tensor_rhs, contract_dims);
    } else {
      tensor_dst.device(device) = tensor_lhs.contract(tensor_rhs, contract_dims)
                                      .cwiseMin(mul_params.clamp_max())
                                      .cwiseMax(mul_params.clamp_min());
    }
  }
}

template <typename Scalar, typename MulParamsType>
void EvalEigenTensor(const Matrix<Scalar>& lhs, const Matrix<Scalar>& rhs,
                     const MulParamsType& mul_params, int max_num_threads,
                     Matrix<Scalar>* dst) {
  int index =
      Mash(lhs.layout().order(), rhs.layout().order(), dst->layout().order());
  switch (index) {
#define EVALEIGENTENSOR_CASE3(LHS, RHS, DST)                    \
  case Mash(LHS, RHS, DST):                                     \
    return EvalEigenTensor<LHS, RHS, DST>(lhs, rhs, mul_params, \
                                          max_num_threads, dst);
#define EVALEIGENTENSOR_CASE2(LHS, RHS)             \
  EVALEIGENTENSOR_CASE3(LHS, RHS, Order::kColMajor) \
  EVALEIGENTENSOR_CASE3(LHS, RHS, Order::kRowMajor)
#define EVALEIGENTENSOR_CASE1(LHS)             \
  EVALEIGENTENSOR_CASE2(LHS, Order::kColMajor) \
  EVALEIGENTENSOR_CASE2(LHS, Order::kRowMajor)

    EVALEIGENTENSOR_CASE1(Order::kColMajor)
    EVALEIGENTENSOR_CASE1(Order::kRowMajor)

#undef EVALEIGENTENSOR_CASE1
#undef EVALEIGENTENSOR_CASE2
#undef EVALEIGENTENSOR_CASE3

    default:
      RUY_CHECK(false);
  }
}

template <typename Scalar>
struct GenericBlasGemm {};

template <>
struct GenericBlasGemm<lapack::doublereal> {
  static void Run(char* transa, char* transb, lapack::integer* m,
                  lapack::integer* n, lapack::integer* k,
                  lapack::doublereal* alpha, lapack::doublereal* a,
                  lapack::integer* lda, lapack::doublereal* b,
                  lapack::integer* ldb, lapack::doublereal* beta,
                  lapack::doublereal* c, lapack::integer* ldc) {
    dgemm_(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
  }
};

template <>
struct GenericBlasGemm<lapack::real> {
  static void Run(char* transa, char* transb, lapack::integer* m,
                  lapack::integer* n, lapack::integer* k, lapack::real* alpha,
                  lapack::real* a, lapack::integer* lda, lapack::real* b,
                  lapack::integer* ldb, lapack::real* beta, lapack::real* c,
                  lapack::integer* ldc) {
    sgemm_(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
  }
};

inline void TransposeLayout(Layout* layout) {
  layout->set_order((layout->order() == Order::kRowMajor) ? Order::kColMajor
                                                          : Order::kRowMajor);
  int tmp_rows = layout->rows();
  layout->set_rows(layout->cols());
  layout->set_cols(tmp_rows);
}

template <typename Scalar>
void Transpose(Matrix<Scalar>* matrix) {
  TransposeLayout(matrix->mutable_layout());
}

template <typename Scalar, typename MulParamsType>
void EvalOpenBlas(const Matrix<Scalar>& lhs, const Matrix<Scalar>& rhs,
                  const MulParamsType& mul_params, int max_num_threads,
                  Matrix<Scalar>* dst) {
  RUY_CHECK_EQ(lhs.zero_point(), 0);
  RUY_CHECK_EQ(rhs.zero_point(), 0);
  RUY_CHECK_EQ(dst->zero_point(), 0);
  RUY_CHECK_EQ(mul_params.multiplier_fixedpoint(), 0);
  RUY_CHECK_EQ(mul_params.multiplier_exponent(), 0);

  Matrix<Scalar> gemm_lhs;
  Matrix<Scalar> gemm_rhs;
  Matrix<Scalar> gemm_dst;
  gemm_dst = *dst;

  // Use Transpose to reduce to the all-column-major case.
  // Notice that ruy::Matrix merely holds a pointer and does not own data,
  // so Transpose is cheap -- no actual matrix data is being transposed here.
  if (dst->layout().order() == Order::kColMajor) {
    gemm_lhs = lhs;
    gemm_rhs = rhs;
  } else {
    gemm_lhs = rhs;
    gemm_rhs = lhs;
    Transpose(&gemm_lhs);
    Transpose(&gemm_rhs);
    Transpose(&gemm_dst);
  }
  bool transposed_lhs = false;
  bool transposed_rhs = false;

  if (gemm_lhs.layout().order() == Order::kRowMajor) {
    Transpose(&gemm_lhs);
    transposed_lhs = true;
  }
  if (gemm_rhs.layout().order() == Order::kRowMajor) {
    Transpose(&gemm_rhs);
    transposed_rhs = true;
  }

  RUY_CHECK_EQ(gemm_lhs.layout().order(), Order::kColMajor);
  RUY_CHECK_EQ(gemm_rhs.layout().order(), Order::kColMajor);
  RUY_CHECK_EQ(gemm_dst.layout().order(), Order::kColMajor);

  char transa = transposed_lhs ? 'T' : 'N';
  char transb = transposed_rhs ? 'T' : 'N';
  int m = gemm_lhs.layout().rows();
  int n = gemm_rhs.layout().cols();
  int k = gemm_lhs.layout().cols();
  float alpha = 1;
  Scalar* a = gemm_lhs.data();
  int lda = gemm_lhs.layout().stride();
  Scalar* b = gemm_rhs.data();
  int ldb = gemm_rhs.layout().stride();
  float beta = 0;
  Scalar* c = gemm_dst.data();
  int ldc = gemm_dst.layout().stride();
  GenericBlasGemm<Scalar>::Run(&transa, &transb, &m, &n, &k, &alpha, a, &lda, b,
                               &ldb, &beta, c, &ldc);

  // BLAS does not allow us to express the bias-addition and clamping, so
  // we use Eigen for that.

  using EigenDstType =
      typename Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>::
          template StridedMapType<Eigen::OuterStride<Eigen::Dynamic>>::type;
  using EigenBiasType =
      typename Eigen::Matrix<Scalar, Eigen::Dynamic, 1>::ConstMapType;

  EigenDstType eigen_dst(
      gemm_dst.data(), gemm_dst.layout().rows(), gemm_dst.layout().cols(),
      Eigen::OuterStride<Eigen::Dynamic>(gemm_dst.layout().stride()));
  Eigen::setNbThreads(max_num_threads ? max_num_threads : 1);

  if (mul_params.bias()) {
    EigenBiasType eigen_bias(mul_params.bias(), dst->layout().rows());
    if (mul_params.clamp_max() == std::numeric_limits<Scalar>::infinity() &&
        mul_params.clamp_min() == -std::numeric_limits<Scalar>::infinity()) {
      eigen_dst.noalias() = eigen_dst.colwise() + eigen_bias;
    } else {
      eigen_dst.noalias() = (eigen_dst.colwise() + eigen_bias)
                                .cwiseMin(mul_params.clamp_max())
                                .cwiseMax(mul_params.clamp_min());
    }
  } else {
    if (mul_params.clamp_max() == std::numeric_limits<Scalar>::infinity() &&
        mul_params.clamp_min() == -std::numeric_limits<Scalar>::infinity()) {
      // Nothing to do: the GEMM call above already wrote the final result.
    } else {
      eigen_dst.noalias() = eigen_dst.cwiseMin(mul_params.clamp_max())
                                .cwiseMax(mul_params.clamp_min());
    }
  }
}
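
// For illustration of the reduction above: if dst is row-major, computing
// dst = lhs * rhs is equivalent to computing dst^T = rhs^T * lhs^T with all
// three matrices reinterpreted as column-major, which is why the lhs/rhs roles
// are swapped and the layouts transposed before calling into BLAS.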
1263 
1264 template <typename TestSetType>
1265 struct SupportsGemmlowp {
1266   static constexpr bool kValue =
1267       std::is_same<typename TestSetType::LhsScalar, std::uint8_t>::value &&
1268       std::is_same<typename TestSetType::RhsScalar, std::uint8_t>::value;
1269 };
1270 
1271 template <typename TestSetType>
1272 struct UsesSingleScalarType {
1273   static constexpr bool kValue =
1274       std::is_same<typename TestSetType::DstScalar,
1275                    typename TestSetType::LhsScalar>::value &&
1276       std::is_same<typename TestSetType::DstScalar,
1277                    typename TestSetType::RhsScalar>::value &&
1278       std::is_same<typename TestSetType::DstScalar,
1279                    typename TestSetType::AccumScalar>::value;
1280 };
1281 
1282 template <typename TestSetType,
1283           bool IsFloatingPoint =
1284               std::is_floating_point<typename TestSetType::AccumScalar>::value,
1285           bool EnableGemmlowp = SupportsGemmlowp<TestSetType>::kValue,
1286           bool SingleScalarType = UsesSingleScalarType<TestSetType>::kValue>
1287 struct EvalExternalPathImpl {
1288   using DstScalar = typename TestSetType::DstScalar;
1289   static void Run(TestSetType*, TestResult<DstScalar>*) { RUY_CHECK(false); }
1290 };
1291 
1292 template <typename TestSetType>
1293 struct EvalExternalPathImpl<TestSetType, true, false, true> {
1294   using DstScalar = typename TestSetType::DstScalar;
1295   static void Run(TestSetType* test_set, TestResult<DstScalar>* test_result) {
1296     if (test_result->external_path == ExternalPath::kEigen) {
1297       EvalEigen(test_set->lhs.matrix, test_set->rhs.matrix,
1298                 test_set->mul_params, test_set->max_num_threads,
1299                 &test_result->storage_matrix.matrix);
1300     } else if (test_result->external_path == ExternalPath::kEigenTensor) {
1301       EvalEigenTensor(test_set->lhs.matrix, test_set->rhs.matrix,
1302                       test_set->mul_params, test_set->max_num_threads,
1303                       &test_result->storage_matrix.matrix);
1304     } else if (test_result->external_path == ExternalPath::kOpenBlas) {
1305       EvalOpenBlas(test_set->lhs.matrix, test_set->rhs.matrix,
1306                    test_set->mul_params, test_set->max_num_threads,
1307                    &test_result->storage_matrix.matrix);
1308     } else {
1309       RUY_CHECK(false);
1310     }
1311   }
1312 };
1313 
1314 template <typename TestSetType, bool SingleScalarType>
1315 struct EvalExternalPathImpl<TestSetType, false, true, SingleScalarType> {
1316   using DstScalar = typename TestSetType::DstScalar;
1317   static void Run(TestSetType* test_set, TestResult<DstScalar>* test_result) {
1318     if (test_result->external_path == ExternalPath::kGemmlowp) {
1319       EvalGemmlowp(test_set->lhs.matrix, test_set->rhs.matrix,
1320                    test_set->mul_params, test_set->max_num_threads,
1321                    &test_result->storage_matrix.matrix);
1322     } else {
1323       RUY_CHECK(false);
1324     }
1325   }
1326 };
1327 
1328 #endif  // RUY_TEST_EXTERNAL_PATHS
1329 
1330 template <typename TestSetType>
1331 void EvalExternalPath(
1332     TestSetType* test_set,
1333     TestResult<typename TestSetType::DstScalar>* test_result) {
1334   if (test_result->external_path == ExternalPath::kReference) {
1335     // kReference is special because it's always available (the implementation
1336     // is provided by ruy) and supports all cases (quantized and float).
1337     ruy::ReferenceMul(test_set->lhs.matrix, test_set->rhs.matrix,
1338                       test_set->mul_params,
1339                       &test_result->storage_matrix.matrix);
1340   } else {
1341 #ifdef RUY_TEST_EXTERNAL_PATHS
1342     EvalExternalPathImpl<TestSetType>::Run(test_set, test_result);
1343 #endif  // RUY_TEST_EXTERNAL_PATHS
1344   }
1345 }
1346 
1347 template <typename Scalar>
1348 bool Agree(ExternalPath external_path1, const Matrix<Scalar>& matrix1,
1349            ExternalPath external_path2, const Matrix<Scalar>& matrix2,
1350            int depth) {
1351   RUY_CHECK_EQ(matrix1.layout().rows(), matrix2.layout().rows());
1352   RUY_CHECK_EQ(matrix1.layout().cols(), matrix2.layout().cols());
1353   RUY_CHECK_EQ(matrix1.zero_point(), matrix2.zero_point());
1354   const int size = matrix1.layout().rows() * matrix1.layout().cols();
1355   double tolerated_max_diff = 0;
1356   double tolerated_mean_diff = 0;
1357   const float kSmallestAllowedDifference =
1358       4. * std::numeric_limits<Scalar>::epsilon();
1359   if (std::is_floating_point<Scalar>::value) {
1360     double max_abs_val = 0;
1361     for (int row = 0; row < matrix1.layout().rows(); row++) {
1362       for (int col = 0; col < matrix1.layout().cols(); col++) {
1363         max_abs_val =
1364             std::max(max_abs_val,
1365                      std::abs(static_cast<double>(Element(matrix1, row, col))));
1366         max_abs_val =
1367             std::max(max_abs_val,
1368                      std::abs(static_cast<double>(Element(matrix2, row, col))));
1369       }
1370     }
1371     tolerated_max_diff = max_abs_val * std::numeric_limits<Scalar>::epsilon() *
1372                          64 * std::sqrt(static_cast<float>(depth));
1373     if (tolerated_max_diff < kSmallestAllowedDifference) {
1374       // Clamp the tolerated max diff to be a bit above machine epsilon if the
1375       // calculated value is too small.
1376       tolerated_max_diff = kSmallestAllowedDifference;
1377       if (external_path1 == ExternalPath::kEigen ||
1378           external_path2 == ExternalPath::kEigen ||
1379           external_path1 == ExternalPath::kEigenTensor ||
1380           external_path2 == ExternalPath::kEigenTensor) {
1381         // Make additional allowance for Eigen differences.
1382         tolerated_max_diff *= 10.0f;
1383       }
1384     }
1385     tolerated_mean_diff = tolerated_max_diff / std::sqrt(size);
1386   } else if (std::is_same<Scalar, std::int32_t>::value) {
1387     // raw integer case, no rounding, so we can require exactness
1388     tolerated_max_diff = 0;
1389     tolerated_mean_diff = 0;
1390   } else {
1391     // quantized case, with rounding errors in downscaling int32 accumulators
1392     // to final 8bit or 16bit values.
1393 
1394     if (external_path1 != ExternalPath::kNone ||
1395         external_path2 != ExternalPath::kNone) {
1396       // In this case, we are comparing against some other library than ruy.
1397       //
1398       // We may have to tolerate an error of +/- 1 from using different
1399       // rounding in fixed-point multiplication, and then again an error of +/-
1400       // 1 from using different rounding in right shifts, so the tolerance on
1401       // the difference may have to be as large as 2.
1402       tolerated_max_diff = 2;
1403     } else if (RUY_PLATFORM_ARM) {
1404       // All our code paths on ARM (32 and 64-bit) are bit-exact
1405       // with the reference code (by design of the reference code).
1406       tolerated_max_diff = 0;
1407     } else if (RUY_PLATFORM_X86) {
1408       // Our reference and ARM paths have diverged from x86 paths in PR #227.
1409       // TODO: update the x86 path to adapt to that and reset that tolerance
1410       // to 0.
1411       tolerated_max_diff = 1;
1412     } else {
1413       // Other architectures, which we don't have dedicated code paths for
1414       // at the moment.
1415       // TODO: try resetting that tolerance to 0, since by
1416       // definition we're only using non-optimized code paths here.
1417       tolerated_max_diff = 1;
1418     }
1419 
1420     // Totally empirical formula for the tolerated mean difference.
1421     tolerated_mean_diff = std::min(1.0, 2.0 * std::pow(size, -0.18));
1422   }
1423   double sum_diff = 0;
1424   for (int row = 0; row < matrix1.layout().rows(); row++) {
1425     for (int col = 0; col < matrix1.layout().cols(); col++) {
1426       double elem1 = Element(matrix1, row, col);
1427       double elem2 = Element(matrix2, row, col);
1428       double diff = elem1 - elem2;
1429 
1430       sum_diff += diff;
1431       // Equivalent to testing (std::abs(diff) > tolerated_max_diff), except
1432       // that this form is also true when diff is NaN.
1433       if (!(std::abs(diff) <= tolerated_max_diff)) {
1434         return false;
1435       }
1436     }
1437   }
1438   double mean_diff = sum_diff / size;
1439   if (std::abs(mean_diff) > tolerated_mean_diff) {
1440     return false;
1441   }
1442   return true;
1443 }
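
// Illustrative sketch, not part of the original header and not used by the
// tests: the floating-point tolerance computed above, factored out so it can
// be reasoned about in isolation (the extra 10x allowance for Eigen
// comparisons is omitted, and the helper name is ours, not a ruy API).
// For example, with max_abs_val = 1.0 and depth = 1024 it returns
// 1.0 * float-epsilon (~1.19e-7) * 64 * sqrt(1024), roughly 2.4e-4.
inline double ExampleToleratedMaxDiffForFloat(double max_abs_val, int depth) {
  const double kSmallestAllowedDifference =
      4. * std::numeric_limits<float>::epsilon();
  const double tolerated = max_abs_val *
                           std::numeric_limits<float>::epsilon() * 64 *
                           std::sqrt(static_cast<double>(depth));
  // Clamp to a bit above machine epsilon, mirroring the logic above.
  return std::max(tolerated, kSmallestAllowedDifference);
}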
1444 
1445 template <typename Scalar>
1446 bool Agree(ExternalPath external_path1,
1447            const StorageMatrix<Scalar>& storage_matrix1,
1448            ExternalPath external_path2,
1449            const StorageMatrix<Scalar>& storage_matrix2, int depth) {
1450   VerifyConsistentFields(storage_matrix1);
1451   VerifyConsistentFields(storage_matrix2);
1452   return Agree(external_path1, storage_matrix1.matrix, external_path2,
1453                storage_matrix2.matrix, depth);
1454 }
1455 
1456 template <typename Scalar>
1457 bool Agree(const TestResult<Scalar>& result1, const TestResult<Scalar>& result2,
1458            int depth) {
1459   return Agree(result1.external_path, result1.storage_matrix,
1460                result2.external_path, result2.storage_matrix, depth);
1461 }
1462 
1463 template <typename Scalar>
1464 void AddTestResultToCluster(
1465     TestResult<Scalar>** result,
1466     std::vector<std::vector<TestResult<Scalar>*>>& clusters, int depth) {
1467   bool inserted = false;
1468   for (auto& cluster : clusters) {
1469     bool agreement = true;
1470     // Test for agreement with every result in the cluster.
1471     for (auto& other_result : cluster) {
1472       agreement &= Agree(**result, *other_result, depth);
1473     }
1474     if (agreement) {
1475       cluster.push_back(*result);
1476       inserted = true;
1477     }
1478   }
1479   if (!inserted) {
1480     std::vector<TestResult<Scalar>*> new_results;
1481     new_results.push_back(*result);
1482     clusters.push_back(new_results);
1483   }
1484 }
1485 
1486 template <typename Scalar>
1487 void PrintPathsInAgreement(
1488     const std::vector<std::unique_ptr<TestResult<Scalar>>>& results,
1489     int depth) {
1490   // A container of clusters of TestResults: membership in the same cluster
1491   // indicates that those TestResults all agree with each other.
1492   std::vector<std::vector<TestResult<Scalar>*>> clusters;
1493   for (const auto& result : results) {
1494     TestResult<Scalar>* test_result = result.get();
1495     AddTestResultToCluster(&test_result, clusters, depth);
1496   }
1497 
1498   std::cerr << "Error: Not all paths agree.\n";
1499   for (auto& cluster : clusters) {
1500     std::cerr << "These paths all agree with each other: ";
1501     for (auto& result : cluster) {
1502       std::cerr << PathName(*result) << ", ";
1503     }
1504     std::cerr << "but disagree with the rest.\n";
1505   }
1506 }
1507 
1508 struct Stats {
1509   double median;
1510   double mean;
1511   double min;
1512   double max;
1513 };
1514 
1515 inline std::string StatsAsString(const Stats& stats) {
1516   char buf[256];
1517   snprintf(buf, sizeof(buf), "(median = %g, mean = %g, min = %g, max = %g)",
1518            stats.median, stats.mean, stats.min, stats.max);
1519   return std::string(buf);
1520 }
1521 
1522 template <typename Scalar>
1523 void GetMatrixStats(const Matrix<Scalar>& matrix, Stats* stats) {
1524   double min = std::numeric_limits<double>::infinity();
1525   double max = -std::numeric_limits<double>::infinity();
1526   double sum = 0;
1527   std::vector<double> allvals;
1528   for (int row = 0; row < matrix.layout().rows(); row++) {
1529     for (int col = 0; col < matrix.layout().cols(); col++) {
1530       double val = Element(matrix, row, col);
1531       min = std::min(min, val);
1532       max = std::max(max, val);
1533       sum += val;
1534       allvals.push_back(val);
1535     }
1536   }
1537   std::sort(allvals.begin(), allvals.end());
1538   stats->min = min;
1539   stats->max = max;
1540   stats->mean = sum / allvals.size();
1541   stats->median = allvals[allvals.size() / 2];
1542 }
1543 
1544 struct ErrorAnalysis {
1545   Stats stats_good;
1546   Stats stats_bad;
1547   // The fields below help document departures from bit-exactness. They are
1548   // unlikely to be relevant to floating-point results.
1549   std::set<int> error_rows;
1550   std::set<int> error_cols;
1551   int row_of_first_error = 0;
1552   int col_of_first_error = 0;
1553   double first_error_good_value = 0;
1554   double first_error_bad_value = 0;
1555 };
1556 
1557 template <typename TestSetType>
1558 void AnalyzeTestError(const TestSetType& test_set, int first_bad_result_index,
1559                       ErrorAnalysis* error_analysis) {
1560   const auto& good_matrix = test_set.results[0]->storage_matrix.matrix;
1561   const auto& bad_matrix =
1562       test_set.results[first_bad_result_index]->storage_matrix.matrix;
1563   GetMatrixStats(good_matrix, &error_analysis->stats_good);
1564   GetMatrixStats(bad_matrix, &error_analysis->stats_bad);
1565   bool found_first_error = false;
1566   for (int row = 0; row < good_matrix.layout().rows(); row++) {
1567     for (int col = 0; col < good_matrix.layout().cols(); col++) {
1568       if (Element(good_matrix, row, col) != Element(bad_matrix, row, col)) {
1569         if (!found_first_error) {
1570           found_first_error = true;
1571           error_analysis->row_of_first_error = row;
1572           error_analysis->col_of_first_error = col;
1573           error_analysis->first_error_good_value =
1574               Element(good_matrix, row, col);
1575           error_analysis->first_error_bad_value = Element(bad_matrix, row, col);
1576         }
1577         error_analysis->error_rows.insert(row);
1578         error_analysis->error_cols.insert(col);
1579       }
1580     }
1581   }
1582 }
1583 
1584 template <typename TestSetType>
1585 void ComputeReasonableMultiplier(
1586     const Matrix<typename TestSetType::LhsScalar>& lhs,
1587     const Matrix<typename TestSetType::RhsScalar>&, double* multiplier) {
1588   using LhsScalar = typename TestSetType::LhsScalar;
1589   using RhsScalar = typename TestSetType::RhsScalar;
1590   using DstScalar = typename TestSetType::DstScalar;
1591   if (std::is_floating_point<DstScalar>::value ||
1592       std::is_same<DstScalar, std::int32_t>::value) {
1593     *multiplier = 0;
1594     return;
1595   }
1596   *multiplier = static_cast<double>(std::numeric_limits<DstScalar>::max()) /
1597                 (static_cast<double>(lhs.layout().cols()) *
1598                  std::numeric_limits<LhsScalar>::max() *
1599                  std::numeric_limits<RhsScalar>::max());
1600 }
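
// Worked example of the formula above (the concrete numbers are ours, for
// exposition only): for a uint8 x uint8 -> uint8 multiplication with
// depth = 100, the "reasonable" real multiplier is
//   255 / (100 * 255 * 255) ~ 3.9e-5,
// i.e. small enough that accumulators near their worst-case magnitude still
// land inside the uint8 destination range after rescaling.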
1601 
1602 inline void QuantizeMultiplier(double multiplier_double,
1603                                std::int32_t* multiplier_fixedpoint,
1604                                int* multiplier_exponent) {
1605   RUY_CHECK_GT(multiplier_double, 0);
1606   if (multiplier_double == 0.) {
1607     *multiplier_fixedpoint = 0;
1608     *multiplier_exponent = 0;
1609     return;
1610   }
1611   const double q = std::frexp(multiplier_double, multiplier_exponent);
1612   auto q_fixed = static_cast<std::int64_t>(std::round(q * (1ll << 31)));
1613   RUY_CHECK_LE(q_fixed, (1ll << 31));
1614   if (q_fixed == (1ll << 31)) {
1615     q_fixed /= 2;
1616     ++*multiplier_exponent;
1617   }
1618   RUY_CHECK_LE(q_fixed, std::numeric_limits<std::int32_t>::max());
1619   *multiplier_fixedpoint = static_cast<std::int32_t>(q_fixed);
1620 }
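
// Usage sketch for the helper above (the numeric values are approximate and
// ours, for exposition only):
//
//   std::int32_t fixedpoint;
//   int exponent;
//   QuantizeMultiplier(0.0008, &fixedpoint, &exponent);
//
// yields fixedpoint ~ 1759218604 (about 0.8192 * 2^31) and exponent == -10,
// since 0.0008 == 0.8192 * 2^-10 and std::frexp always returns a mantissa in
// [0.5, 1).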
1621 
1622 template <typename TestSetType>
1623 void SwitchMultiplierToPerChannel(TestSetType* test_set) {
1624   ChannelDimension channel_dimension =
1625       (test_set->benchmark || global_random_engine()() % 2)
1626           ? ChannelDimension::kRow
1627           : ChannelDimension::kCol;
1628   test_set->mul_params.set_channel_dimension(channel_dimension);
1629   const int num_channels = channel_dimension == ChannelDimension::kRow
1630                                ? test_set->rows
1631                                : test_set->cols;
1632   test_set->per_channel_multiplier_fixedpoint.resize(num_channels);
1633   test_set->per_channel_multiplier_exponent.resize(num_channels);
1634   for (int i = 0; i < num_channels; i++) {
1635     // Multipliers typically range in [2^30, 2^31 - 1].
1636     // Values in [0, 2^30 - 1] are normally unused, but harmless.
1637     // Thus a good way to randomize multipliers is to subtract from them a
1638     // random value smaller than 2^30 (here, under 2^26) but still significant.
1639     std::int32_t nudged_multiplier =
1640         test_set->mul_params.multiplier_fixedpoint() -
1641         (global_random_engine()() % (1 << 26));
1642     int nudged_exponent = test_set->mul_params.multiplier_exponent() - 1 +
1643                           (global_random_engine()() % 4);
1644     test_set->per_channel_multiplier_fixedpoint[i] = nudged_multiplier;
1645     test_set->per_channel_multiplier_exponent[i] = nudged_exponent;
1646   }
1647   test_set->mul_params.set_multiplier_fixedpoint(0);
1648   test_set->mul_params.set_multiplier_exponent(0);
1649   test_set->mul_params.set_multiplier_fixedpoint_perchannel(
1650       test_set->per_channel_multiplier_fixedpoint.data());
1651   test_set->mul_params.set_multiplier_exponent_perchannel(
1652       test_set->per_channel_multiplier_exponent.data());
1653 }
1654 
1655 template <
1656     typename TestSetType,
1657     bool IsApplicable =
1658         std::is_same<typename TestSetType::AccumScalar, std::int32_t>::value &&
1659         !std::is_same<typename TestSetType::DstScalar, std::int32_t>::value>
1660 struct MakeSpecMultiplierFieldsImpl {};
1661 
1662 template <typename TestSetType>
1663 struct MakeSpecMultiplierFieldsImpl<TestSetType, true> {
1664   static void Run(TestSetType* test_set) {
1665     double multiplier;
1666     ComputeReasonableMultiplier<TestSetType>(test_set->lhs.matrix,
1667                                              test_set->rhs.matrix, &multiplier);
1668     typename TestSetType::AccumScalar multiplier_fixedpoint;
1669     int multiplier_exponent;
1670     QuantizeMultiplier(multiplier, &multiplier_fixedpoint,
1671                        &multiplier_exponent);
1672     test_set->mul_params.set_multiplier_fixedpoint(multiplier_fixedpoint);
1673     test_set->mul_params.set_multiplier_exponent(multiplier_exponent);
1674 
1675     if (!test_set->benchmark) {
1676       test_set->perchannel = global_random_engine()() & 1;
1677     }
1678     if (test_set->perchannel) {
1679       SwitchMultiplierToPerChannel(test_set);
1680     }
1681   }
1682 };
1683 
1684 template <typename TestSetType>
1685 struct MakeSpecMultiplierFieldsImpl<TestSetType, false> {
1686   static void Run(TestSetType*) {}
1687 };
1688 
1689 template <typename MulParamsType>
1690 void MakeSpecClampFields(MulParamsType* mul_params) {
1691   using DstScalar = typename MulParamsType::DstScalar;
1692 
1693   if (getenv("BENCHMARK_ONLY_MATMUL")) {
1694     if (std::is_floating_point<DstScalar>::value) {
1695       mul_params->set_clamp_min(-std::numeric_limits<DstScalar>::infinity());
1696       mul_params->set_clamp_max(std::numeric_limits<DstScalar>::infinity());
1697     } else {
1698       mul_params->set_clamp_min(std::numeric_limits<DstScalar>::lowest());
1699       mul_params->set_clamp_max(std::numeric_limits<DstScalar>::max());
1700     }
1701     return;
1702   }
1703 
1704   mul_params->set_clamp_min(std::numeric_limits<DstScalar>::lowest() + 1);
1705   mul_params->set_clamp_max(std::numeric_limits<DstScalar>::max() - 1);
1706 }
1707 
1708 inline void MakeSpecClampFields(MulParams<std::int32_t, std::int32_t>*) {
1709   // When returning raw int32 accumulators, clamping is not supported.
1710 }
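
// Illustrative example of the clamp bounds chosen above (not stated in the
// original header): for DstScalar = std::uint8_t the default test bounds are
// [lowest() + 1, max() - 1] = [1, 254], deliberately one step inside the
// representable range so that the clamping code paths are actually exercised.
// BENCHMARK_ONLY_MATMUL widens them to the full [0, 255] range, and the
// std::int32_t overload above sets no bounds at all since raw accumulators
// are returned unclamped.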
1711 
1712 template <typename LhsScalar, typename RhsScalar, typename AccumScalar,
1713           typename DstScalar>
1714 void TestSet<LhsScalar, RhsScalar, AccumScalar, DstScalar>::MakeZeroPoints() {
1715   RUY_CHECK_EQ(life_stage, LifeStage::kInitial);
1716   if (std::is_same<LhsScalar, std::int16_t>::value ||
1717       std::is_same<RhsScalar, std::int16_t>::value) {
1718     // For now, support for int16 source types is limited to the
1719     // symmetric case (zero_point==0) because that appears to be
1720     // the case in the initial use cases, and that limits complexity
1721     // in thinking about accumulator overflows.
1722     // Setting use_specified_zero_points causes the default values 0 to be
1723     // used unless explicitly overridden.
1724     use_specified_zero_points = true;
1725   }
1726   if (!benchmark && !use_specified_zero_points) {
1727     MakeRandomScalar(RandomRange::kReasonableSrcZeroPoint, &lhs_zero_point);
1728     MakeRandomScalar(RandomRange::kReasonableSrcZeroPoint, &rhs_zero_point);
1729     // If destination is std::int32_t, no dst_zero_point is necessary.
1730     if (std::is_same<DstScalar, std::int32_t>::value) {
1731       dst_zero_point = 0;
1732     } else {
1733       MakeRandomScalar(RandomRange::kReasonableDstZeroPoint, &dst_zero_point);
1734     }
1735   }
1736   life_stage = LifeStage::kHasZeroPoints;
1737 }
1738 
1739 template <typename LhsScalar, typename RhsScalar, typename AccumScalar,
1740           typename DstScalar>
1741 void TestSet<LhsScalar, RhsScalar, AccumScalar, DstScalar>::MakeLhsRhs() {
1742   RUY_CHECK_EQ(life_stage, LifeStage::kHasZeroPoints);
1743   MakeRandom(rows, depth, lhs_order, lhs_zero_point, layout_style,
1744              RandomRange::kOffCenterAvoidMinValue, &lhs);
1745   MakeRandom(depth, cols, rhs_order, rhs_zero_point, layout_style,
1746              RandomRange::kGeneral, &rhs);
1747   if (!benchmark) {
1748     cache_lhs = (global_random_engine()() & 0xf) == 0;
1749     cache_rhs = (global_random_engine()() & 0xf) == 0;
1750   }
1751   if (cache_lhs) {
1752     lhs.matrix.set_cache_policy(CachePolicy::kAlwaysCache);
1753   }
1754   if (cache_rhs) {
1755     rhs.matrix.set_cache_policy(CachePolicy::kAlwaysCache);
1756   }
1757   life_stage = LifeStage::kHasLhsRhs;
1758 }
1759 
1760 template <typename LhsScalar, typename RhsScalar, typename AccumScalar,
1761           typename DstScalar>
1762 void TestSet<LhsScalar, RhsScalar, AccumScalar, DstScalar>::MakeMulParams() {
1763   RUY_CHECK_EQ(life_stage, LifeStage::kHasLhsRhs);
1764 
1765   if (!getenv("BENCHMARK_ONLY_MATMUL") &&
1766       (benchmark || (global_random_engine()() & 1))) {
1767     MakeRandomVector(RandomRange::kBias, std::max(rows, cols), &bias_data);
1768     mul_params.set_bias(bias_data.data());
1769   }
1770   if (lhs.matrix.zero_point() == std::numeric_limits<LhsScalar>::lowest() &&
1771       rhs.matrix.zero_point() == std::numeric_limits<RhsScalar>::lowest()) {
1772     lhs.matrix.set_zero_point(lhs.matrix.zero_point() + 1);
1773   }
1774   MakeSpecMultiplierFieldsImpl<TestSet>::Run(this);
1775   MakeSpecClampFields(&mul_params);
1776   life_stage = LifeStage::kHasMulParams;
1777 }
1778 
1779 inline int GetIntEnvVarOrZero(const char* name) {
1780   const char* val = getenv(name);
1781   if (!val) {
1782     return 0;
1783   }
1784   return std::stoi(val);
1785 }
1786 
1787 inline float GetFloatEnvVarOrZero(const char* name) {
1788   const char* val = getenv(name);
1789   if (!val) {
1790     return 0;
1791   }
1792   return std::stof(val);
1793 }
1794 
1795 inline int GetHexIntEnvVarOrZero(const char* name) {
1796   const char* val = getenv(name);
1797   if (!val) {
1798     return 0;
1799   }
1800   return std::stoi(val, nullptr, 16);
1801 }
1802 
1803 inline bool GetBoolEnvVarOrFalse(const char* name) {
1804   return static_cast<bool>(GetIntEnvVarOrZero(name));
1805 }
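
// The helpers above back the environment variables consumed in this file:
// PATHS (hex bitfield of ruy paths to test), THREADS (thread count), NOEXT
// (disable external/reference paths), BENCHMARK_ONLY_MATMUL and NO_OPENBLAS
// (benchmark tweaks), plus RUY_BENCHMARK_COLD, RUY_BENCHMARK_PMU,
// RUY_BENCHMARK_REPEATS and RUY_BENCHMARK_MIN_SECS (benchmark controls).
// An illustrative invocation (the binary name is hypothetical):
//
//   PATHS=ffff THREADS=1 RUY_BENCHMARK_REPEATS=8 ./ruy_benchmark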
1806 
1807 template <typename LhsScalar, typename RhsScalar, typename AccumScalar,
1808           typename DstScalar>
1809 void TestSet<LhsScalar, RhsScalar, AccumScalar, DstScalar>::MakeOtherParams() {
1810   RUY_CHECK_EQ(life_stage, LifeStage::kHasMulParams);
1811   if (max_num_threads == 0) {
1812     max_num_threads = GetIntEnvVarOrZero("THREADS");
1813   }
1814   life_stage = LifeStage::kHasOtherParams;
1815 }
1816 
1817 inline std::vector<Path> PathsBitfieldAsVector(Path paths_bitfield) {
1818   std::vector<Path> result;
1819   std::uint32_t remaining_paths = static_cast<std::uint32_t>(paths_bitfield);
1820   std::uint32_t test_bit = 1;
1821   while (remaining_paths) {
1822     if (remaining_paths & test_bit) {
1823       result.push_back(static_cast<Path>(test_bit));
1824     }
1825     remaining_paths &= ~test_bit;
1826     test_bit <<= 1;
1827   }
1828   return result;
1829 }
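
// Illustrative example (the concrete bit values are ours): if paths_bitfield
// has bits 0 and 2 set (i.e. the value 0x5), the loop above peels off the set
// bits from lowest to highest and returns the two single-bit Path values
// corresponding to 0x1 and 0x4, in that order.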
1830 
1831 inline std::vector<Tuning> EnumerateTuningsForPath(Path path, bool benchmark) {
1832   if (benchmark) {
1833     return {Tuning::kAuto};
1834   }
1835 #if RUY_PLATFORM_ARM
1836   if (path == Path::kNeon || path == Path::kNeonDotprod) {
1837     return {Tuning::kA55ish, Tuning::kX1, Tuning::kGeneric, Tuning::kAuto};
1838   }
1839 #endif
1840   (void)path;
1841   return {Tuning::kAuto};
1842 }
1843 
1844 template <typename LhsScalar, typename RhsScalar, typename AccumScalar,
1845           typename DstScalar>
1846 void TestSet<LhsScalar, RhsScalar, AccumScalar, DstScalar>::MakeResultPaths() {
1847   RUY_CHECK_EQ(life_stage, LifeStage::kHasOtherParams);
1848 
1849   Path paths_bitfield = static_cast<Path>(GetHexIntEnvVarOrZero("PATHS"));
1850 
1851   if (paths_bitfield == Path::kNone) {
1852     // Use a dummy Context just to perform the resolution of specific runtime
1853     // enabled paths.
1854     Context context;
1855     paths_bitfield = get_ctx(&context)->GetRuntimeEnabledPaths();
1856   }
1857 
1858   // Disable the internal test-only variants of the StandardCpp path in
1859   // benchmarks.
1860   if (benchmark) {
1861     paths_bitfield = paths_bitfield & kAllPaths;
1862   }
1863 
1864   // Disable the internal test-only variants of the StandardCpp path on large
1865   // tests.
1866   // This constant must be large enough to exercise some interesting BlockMap
1867   // logic, yet small enough to avoid large test-latency increases from running
1868   // these slow code paths on large matrix multiplications.
1869   const int kMaxSizeToTestInternalStandardCppVariants = 300;
1870   if (rows > kMaxSizeToTestInternalStandardCppVariants ||
1871       cols > kMaxSizeToTestInternalStandardCppVariants ||
1872       depth > kMaxSizeToTestInternalStandardCppVariants) {
1873     paths_bitfield = paths_bitfield & kAllPaths;
1874   }
1875 
1876   // Trim bits that don't correspond to a compiled path,
1877   // to allow specifying e.g. ffff to mean 'all paths' regardless of whether all
1878   // those bits exist as actual paths.
1879   paths_bitfield = paths_bitfield & kAllPathsIncludingInternalVariants;
1880   RUY_CHECK_NE(paths_bitfield, Path::kNone);
1881   paths = PathsBitfieldAsVector(paths_bitfield);
1882 
1883   // kReference is a special 'external path' that's always available.
1884   // It can still be disabled by NOEXT.
1885   if (!GetBoolEnvVarOrFalse("NOEXT")) {
1886     external_paths.push_back(ExternalPath::kReference);
1887   }
1888 
1889 #ifdef RUY_TEST_EXTERNAL_PATHS
1890 
1891   if (!GetBoolEnvVarOrFalse("NOEXT")) {
1892     if (SupportsGemmlowp<TestSet>::kValue) {
1893 #ifdef GEMMLOWP_SSE4
1894       const bool gemmlowp_supported =
1895           !mul_params.multiplier_fixedpoint_perchannel() &&
1896           mul_params.channel_dimension() == ChannelDimension::kRow;
1897 #else
1898       const bool gemmlowp_supported =
1899           mul_params.channel_dimension() == ChannelDimension::kRow;
1900 #endif
1901       if (gemmlowp_supported) {
1902         external_paths.push_back(ExternalPath::kGemmlowp);
1903       }
1904     }
1905     if (UsesSingleScalarType<TestSet>::kValue &&
1906         std::is_floating_point<AccumScalar>::value) {
1907       external_paths.push_back(ExternalPath::kEigen);
1908       if (layout_style == LayoutStyle::kUnstridedLinear) {
1909         external_paths.push_back(ExternalPath::kEigenTensor);
1910       }
1911 // We link against a generic BLAS target that only maps to OpenBLAS on specific
1912 // architectures.
1913 #if RUY_PLATFORM_ARM_32 || RUY_PLATFORM_ARM_64
1914       // OpenBLAS multi-threading is disabled, so only benchmark against it in
1915       // single-threaded runs, to avoid mixing single- and multi-threaded results.
1916       if (max_num_threads == 1 && !getenv("NO_OPENBLAS")) {
1917         external_paths.push_back(ExternalPath::kOpenBlas);
1918       }
1919 #endif
1920     }
1921   }
1922 
1923 #endif  // RUY_TEST_EXTERNAL_PATHS
1924 
1925   for (Path path : paths) {
1926     for (Tuning tuning : EnumerateTuningsForPath(path, benchmark)) {
1927       results.emplace_back(new TestResultType);
1928       TestResultType& result = *results.back();
1929       result.path = path;
1930       result.tuning = tuning;
1931       MakeRandom(rows, cols, dst_order, dst_zero_point, layout_style,
1932                  RandomRange::kGeneral, &result.storage_matrix);
1933     }
1934   }
1935 
1936   for (ExternalPath external_path : external_paths) {
1937     results.emplace_back(new TestResultType);
1938     TestResultType& result = *results.back();
1939     result.external_path = external_path;
1940     MakeRandom(rows, cols, dst_order, dst_zero_point, layout_style,
1941                RandomRange::kGeneral, &result.storage_matrix);
1942   }
1943 
1944   life_stage = LifeStage::kHasResultPaths;
1945 }
1946 
1947 template <typename LhsScalar, typename RhsScalar, typename AccumScalar,
1948           typename DstScalar>
1949 void TestSet<LhsScalar, RhsScalar, AccumScalar, DstScalar>::EvalResult(
1950     TestResult<DstScalar>* result) {
1951   RUY_CHECK(result->path != Path::kNone ||
1952             result->external_path != ExternalPath::kNone);
1953   if (result->path != Path::kNone) {
1954     EvalRuy(result);
1955   } else {
1956     EvalExternalPath(this, result);
1957   }
1958   const std::string& pathname = PathName(*result);
1959   if (std::find(CoveredPaths()->begin(), CoveredPaths()->end(), pathname) ==
1960       CoveredPaths()->end()) {
1961     CoveredPaths()->push_back(pathname);
1962   }
1963 }
1964 
1965 using f32 = float;
1966 using f64 = double;
1967 using u8 = std::uint8_t;
1968 using i8 = std::int8_t;
1969 using u16 = std::uint16_t;
1970 using i16 = std::int16_t;
1971 using u32 = std::uint32_t;
1972 using i32 = std::int32_t;
1973 using u64 = std::uint64_t;
1974 using i64 = std::int64_t;
1975 
1976 template <typename Scalar>
1977 const char* TypeName() {
1978   return nullptr;
1979 }
1980 
1981 #define RUY_TYPENAME(TYPE)              \
1982   template <>                           \
1983   inline const char* TypeName<TYPE>() { \
1984     return #TYPE;                       \
1985   }
1986 
1987 RUY_TYPENAME(f32)
1988 RUY_TYPENAME(f64)
1989 RUY_TYPENAME(u8)
1990 RUY_TYPENAME(i8)
1991 RUY_TYPENAME(u16)
1992 RUY_TYPENAME(i16)
1993 RUY_TYPENAME(u32)
1994 RUY_TYPENAME(i32)
1995 RUY_TYPENAME(u64)
1996 RUY_TYPENAME(i64)
1997 
1998 #undef RUY_TYPENAME
1999 
2000 template <typename Scalar>
2001 const char* SymmetryName(const Matrix<Scalar>& matrix) {
2002   if (matrix.zero_point() == SymmetricZeroPoint<Scalar>()) {
2003     return "symm";
2004   } else {
2005     return "asymm";
2006   }
2007 }
2008 
2009 template <typename Scalar>
2010 int StorageSize(const Matrix<Scalar>& matrix) {
2011   return sizeof(Scalar) * FlatSize(matrix.layout());
2012 }
2013 
2014 // Helper that replicates a buffer and gives out pointers to the replicas.
2015 // This is useful when one wants to traverse data so that it is cold in cache.
2016 // By having a sufficiently large value of num_repeats, one can ensure that the
2017 // working set covered by the replicas is greater than the cache size.
2018 template <typename T>
2019 class RepeatedBuffer {
2020  public:
2021   RepeatedBuffer() = default;
2022   void Init(const T* elems, int num_elems, int num_repeats) {
2023     buffers_.clear();
2024     allocator_.FreeAll();
2025     for (int i = 0; i < num_repeats; i++) {
2026       T* p;
2027       allocator_.Allocate(num_elems, &p);
2028       memcpy(p, elems, num_elems * sizeof(T));
2029       buffers_.push_back(p);
2030     }
2031   }
2032   T* Next() {
2033     T* ret = buffers_[current_];
2034     current_ = (current_ + 1) % buffers_.size();
2035     return ret;
2036   }
2037 
2038  private:
2039   Allocator allocator_;
2040   std::vector<T*> buffers_;
2041   int current_ = 0;
2042 };
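
// Usage sketch (this mirrors what Benchmark() does below; the repeat count is
// illustrative): replicate each operand enough times that the combined
// replicas exceed the cache, then rotate through the copies so that every
// iteration touches cold data.
//
//   RepeatedBuffer<LhsScalar> cold_lhs;
//   cold_lhs.Init(lhs.matrix.data(), FlatSize(lhs.matrix.layout()),
//                 /*num_repeats=*/64);
//   ...
//   lhs.matrix.set_data(cold_lhs.Next());  // Before each multiplication.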
2043 
2044 template <typename LhsScalar, typename RhsScalar, typename AccumScalar,
2045           typename DstScalar>
2046 void TestSet<LhsScalar, RhsScalar, AccumScalar, DstScalar>::Benchmark(
2047     TestResult<DstScalar>* result) {
2048   const bool cold = getenv("RUY_BENCHMARK_COLD");
2049   LhsScalar* orig_lhs_data = lhs.matrix.data();
2050   RhsScalar* orig_rhs_data = rhs.matrix.data();
2051   DstScalar* orig_dst_data = result->storage_matrix.matrix.data();
2052 
2053   int num_matmul_sets = 0;
2054 
2055   RepeatedBuffer<LhsScalar> cold_lhs;
2056   RepeatedBuffer<RhsScalar> cold_rhs;
2057   RepeatedBuffer<DstScalar> cold_dst;
2058 
2059   if (cold) {
2060     const int kWorkingSetSize = 100 << 20;
2061     const int each_matmul_set_size = StorageSize(lhs.matrix) +
2062                                      StorageSize(rhs.matrix) +
2063                                      StorageSize(result->storage_matrix.matrix);
2064     num_matmul_sets =
2065         (kWorkingSetSize + each_matmul_set_size - 1) / each_matmul_set_size;
2066 
2067     cold_lhs.Init(lhs.matrix.data(), FlatSize(lhs.matrix.layout()),
2068                   num_matmul_sets);
2069     cold_rhs.Init(rhs.matrix.data(), FlatSize(rhs.matrix.layout()),
2070                   num_matmul_sets);
2071     cold_dst.Init(result->storage_matrix.matrix.data(),
2072                   FlatSize(result->storage_matrix.matrix.layout()),
2073                   num_matmul_sets);
2074   }
2075   const bool record_pmu = GetBoolEnvVarOrFalse("RUY_BENCHMARK_PMU");
2076   int repeats = GetIntEnvVarOrZero("RUY_BENCHMARK_REPEATS");
2077   if (!repeats) {
2078     repeats = 4;
2079   }
2080   float benchmark_min_secs = GetFloatEnvVarOrZero("RUY_BENCHMARK_MIN_SECS");
2081   if (!benchmark_min_secs) {
2082     benchmark_min_secs = 0.5;
2083   }
2084 #ifdef RUY_PROFILER
2085   {
2086     const char* lhstype = TypeName<LhsScalar>();
2087     const char* lhssymm = SymmetryName(lhs.matrix);
2088     const char* rhstype = TypeName<RhsScalar>();
2089     const char* rhssymm = SymmetryName(rhs.matrix);
2090 
2091     printf("Profiling path=%s shape=(%dx%dx%d) lhs=(%s,%s) rhs=(%s,%s)\n",
2092            PathName(*result).c_str(), rows, depth, cols, lhstype, lhssymm,
2093            rhstype, rhssymm);
2094     ruy::profiler::ScopeProfile profile;
2095 #endif
2096 
2097   float latency = std::numeric_limits<float>::infinity();
2098   float l1_refill_rate = std::numeric_limits<float>::infinity();
2099   float l2_refill_rate = std::numeric_limits<float>::infinity();
2100   float l3_refill_rate = std::numeric_limits<float>::infinity();
2101   float l1tlb_refill_rate = std::numeric_limits<float>::infinity();
2102   float l2tlb_refill_rate = std::numeric_limits<float>::infinity();
2103   float mispred_rate = std::numeric_limits<float>::infinity();
2104   float frontend_stall_rate = std::numeric_limits<float>::infinity();
2105   float backend_stall_rate = std::numeric_limits<float>::infinity();
2106 
2107   for (int repeat = 0; repeat < repeats; repeat++) {
2108     auto& pmu_events = GlobalPmuEvents();
2109     if (record_pmu) {
2110       pmu_events.StartRecording();
2111     }
2112     TimePoint time_start = Now();
2113     TimePoint t = time_start;
2114     int iters = 0;
2115     int iters_at_a_time = 1;
2116     while (ToFloatSeconds(t - time_start) < benchmark_min_secs) {
2117       for (int i = 0; i < iters_at_a_time; i++) {
2118         if (cold) {
2119           lhs.matrix.set_data(cold_lhs.Next());
2120           rhs.matrix.set_data(cold_rhs.Next());
2121           result->storage_matrix.matrix.set_data(cold_dst.Next());
2122         }
2123         EvalResult(result);
2124         iters++;
2125       }
2126       iters_at_a_time *= 2;
2127       t = Now();
2128     }
2129     latency = std::min(
2130         latency, static_cast<float>(ToFloatSeconds(t - time_start) / iters));
2131     if (record_pmu) {
2132       pmu_events.StopRecording();
2133       const float normalization_factor =
2134           1.0f / (static_cast<float>(iters) * rows * cols * depth);
2135       l1_refill_rate = std::min(
2136           l1_refill_rate, pmu_events.L1RefillCount() * normalization_factor);
2137       l2_refill_rate = std::min(
2138           l2_refill_rate, pmu_events.L2RefillCount() * normalization_factor);
2139       l3_refill_rate = std::min(
2140           l3_refill_rate, pmu_events.L3RefillCount() * normalization_factor);
2141       l1tlb_refill_rate =
2142           std::min(l1tlb_refill_rate,
2143                    pmu_events.L1TLBRefillCount() * normalization_factor);
2144       l2tlb_refill_rate =
2145           std::min(l2tlb_refill_rate,
2146                    pmu_events.L2TLBRefillCount() * normalization_factor);
2147       mispred_rate =
2148           std::min(mispred_rate, pmu_events.BranchMispredictionCount() *
2149                                      normalization_factor);
2150       frontend_stall_rate =
2151           std::min(frontend_stall_rate,
2152                    pmu_events.FrontendStallCount() * normalization_factor);
2153       backend_stall_rate =
2154           std::min(backend_stall_rate,
2155                    pmu_events.BackendStallCount() * normalization_factor);
2156     }
2157   }
2158   result->latency = latency;
2159   if (record_pmu) {
2160     result->l1_refill_rate = l1_refill_rate;
2161     result->l2_refill_rate = l2_refill_rate;
2162     result->l3_refill_rate = l3_refill_rate;
2163     result->l1tlb_refill_rate = l1tlb_refill_rate;
2164     result->l2tlb_refill_rate = l2tlb_refill_rate;
2165     result->mispred_rate = mispred_rate;
2166     result->frontend_stall_rate = frontend_stall_rate;
2167     result->backend_stall_rate = backend_stall_rate;
2168   }
2169 
2170 #ifdef RUY_PROFILER
2171   }
2172   fflush(stdout);
2173 #endif
2174 
2175   if (cold) {
2176     lhs.matrix.set_data(orig_lhs_data);
2177     rhs.matrix.set_data(orig_rhs_data);
2178     memcpy(orig_dst_data, result->storage_matrix.matrix.data(),
2179            StorageSize(result->storage_matrix.matrix));
2180     result->storage_matrix.matrix.set_data(orig_dst_data);
2181   }
2182 }
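
// A note on the numbers recorded above (our reading of the code, for
// exposition): latency is the best (minimum) seconds-per-Mul observed across
// the repeats, and each PMU-derived rate is a raw event count divided by
// iters * rows * cols * depth, i.e. events per multiply-accumulate, likewise
// minimized across repeats.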
2183 
2184 template <typename LhsScalar, typename RhsScalar, typename AccumScalar,
2185           typename DstScalar>
2186 void TestSet<LhsScalar, RhsScalar, AccumScalar, DstScalar>::Eval() {
2187   RUY_CHECK_EQ(life_stage, LifeStage::kHasResultPaths);
2188   for (auto& result : results) {
2189     if (benchmark) {
2190       Benchmark(result.get());
2191     } else {
2192       EvalResult(result.get());
2193     }
2194   }
2195   life_stage = LifeStage::kEvaluated;
2196 }
2197 
2198 template <typename Scalar>
2199 std::string DumpRegion(const Matrix<Scalar>& matrix, int center_row,
2200                        int center_col) {
2201   static constexpr int kRadius = 20;
2202   int first_row = std::max(0, center_row - kRadius);
2203   int last_row = std::min(matrix.layout().rows() - 1, center_row + kRadius);
2204   int first_col = std::max(0, center_col - kRadius);
2205   int last_col = std::min(matrix.layout().cols() - 1, center_col + kRadius);
2206   std::ostringstream stream;
2207   for (int row = first_row; row <= last_row; row++) {
2208     for (int col = first_col; col <= last_col; col++) {
2209       stream << static_cast<double>(Element(matrix, row, col)) << " ";
2210     }
2211     stream << "\n";
2212   }
2213   return stream.str();
2214 }
2215 
2216 template <typename LhsScalar, typename RhsScalar, typename AccumScalar,
2217           typename DstScalar>
2218 void TestSet<LhsScalar, RhsScalar, AccumScalar, DstScalar>::VerifyTestResults()
2219     const {
2220   const int depth = lhs.matrix.layout().cols();
2221   for (int i = 0; i < static_cast<int>(results.size()) - 1; i++) {
2222     if (!Agree(*results[i], *results[i + 1], depth)) {
2223       PrintPathsInAgreement(results, depth);
2224       ErrorAnalysis error_analysis;
2225       AnalyzeTestError(*this, i + 1, &error_analysis);
2226       std::cerr << "Shape: rows = " << rows << ", cols = " << cols
2227                 << ", depth = " << depth << std::endl;
2228       std::cerr << "Stats of the good result matrix: "
2229                 << StatsAsString(error_analysis.stats_good) << std::endl;
2230       std::cerr << "Stats of the bad result matrix:  "
2231                 << StatsAsString(error_analysis.stats_bad) << std::endl;
2232       if (static_cast<int>(error_analysis.error_rows.size()) < rows) {
2233         std::cerr << "Rows containing errors: "
2234                   << Join(error_analysis.error_rows) << std::endl;
2235       } else {
2236         std::cerr << "Errors found in ALL rows." << std::endl;
2237       }
2238       if (static_cast<int>(error_analysis.error_cols.size()) < cols) {
2239         std::cerr << "Cols containing errors: "
2240                   << Join(error_analysis.error_cols) << std::endl;
2241       } else {
2242         std::cerr << "Errors found in ALL cols." << std::endl;
2243       }
2244       std::cerr << "The first error occurs at row "
2245                 << error_analysis.row_of_first_error << ", col "
2246                 << error_analysis.col_of_first_error << std::endl;
2247       std::cerr << "Good value: " << error_analysis.first_error_good_value
2248                 << std::endl;
2249       std::cerr << "Bad value : " << error_analysis.first_error_bad_value
2250                 << std::endl;
2251       std::cerr << "Region of Good result matrix around first error:\n\n"
2252                 << DumpRegion(results[0]->storage_matrix.matrix,
2253                               error_analysis.row_of_first_error,
2254                               error_analysis.col_of_first_error)
2255                 << std::endl;
2256       std::cerr << "Region of Bad result matrix around first error:\n\n"
2257                 << DumpRegion(results[i + 1]->storage_matrix.matrix,
2258                               error_analysis.row_of_first_error,
2259                               error_analysis.col_of_first_error)
2260                 << std::endl;
2261       RUY_CHECK(false);
2262     }
2263   }
2264 }
2265 
2266 template <typename LhsScalar, typename RhsScalar, typename AccumScalar,
2267           typename DstScalar>
2268 void TestSet<LhsScalar, RhsScalar, AccumScalar, DstScalar>::Verify() {
2269   RUY_CHECK_EQ(life_stage, LifeStage::kEvaluated);
2270   VerifyTestResults();
2271   life_stage = LifeStage::kFinal;
2272 }
2273 
2274 template <typename TestSetType>
2275 void TestRCC(int rows, int depth, int cols) {
2276   TestSetType test_set;
2277   test_set.rows = rows;
2278   test_set.depth = depth;
2279   test_set.cols = cols;
2280   test_set.lhs_order = Order::kRowMajor;
2281   test_set.rhs_order = Order::kColMajor;
2282   test_set.dst_order = Order::kColMajor;
2283   test_set.layout_style = LayoutStyle::kUnstridedLinear;
2284   test_set.Run();
2285 }
2286 
2287 template <typename TestSetType>
2288 void TestNonRCC(int rows, int depth, int cols) {
2289   TestSetType test_set;
2290   test_set.rows = rows;
2291   test_set.depth = depth;
2292   test_set.cols = cols;
2293   test_set.lhs_order = Order::kColMajor;
2294   test_set.rhs_order = Order::kColMajor;
2295   test_set.dst_order = Order::kColMajor;
2296   test_set.layout_style = LayoutStyle::kUnstridedLinear;
2297   test_set.Run();
2298 }
2299 
2300 template <typename TestSetType>
2301 void TestLinearAllOrders(int rows, int depth, int cols) {
2302   const std::vector<Order> orders{Order::kColMajor, Order::kRowMajor};
2303 
2304   for (Order lhs_order : orders) {
2305     for (Order rhs_order : orders) {
2306       for (Order dst_order : orders) {
2307         TestSetType test_set;
2308         test_set.rows = rows;
2309         test_set.depth = depth;
2310         test_set.cols = cols;
2311         test_set.lhs_order = lhs_order;
2312         test_set.rhs_order = rhs_order;
2313         test_set.dst_order = dst_order;
2314         test_set.layout_style = LayoutStyle::kLinear;
2315         test_set.Run();
2316       }
2317     }
2318   }
2319 }
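
// Usage sketch (illustrative; the test name and shape are ours, and the exact
// way ruy's own test files invoke these entry points may differ): a gtest case
// drives one of the helpers above with a concrete TestSet instantiation, e.g.
//
//   TEST(RuyTest, LinearAllOrdersExample) {
//     using TestSetType =
//         TestSet<std::uint8_t, std::uint8_t, std::int32_t, std::uint8_t>;
//     TestLinearAllOrders<TestSetType>(/*rows=*/50, /*depth=*/70, /*cols=*/30);
//   }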
2320 
2321 }  // namespace ruy
2322 
2323 #endif  // RUY_RUY_TEST_H_
2324