1 /* Copyright 2019 Google LLC. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #ifndef RUY_RUY_TEST_H_
17 #define RUY_RUY_TEST_H_
18
19 #include <math.h>
20
21 #include <algorithm>
22 #include <cstddef>
23 #include <cstdint>
24 #include <cstdio>
25 #include <cstdlib>
26 #include <ctime>
27 #include <iostream>
28 #include <iterator>
29 #include <limits>
30 #include <memory>
31 #include <numeric>
32 #include <random>
33 #include <set>
34 #include <sstream>
35 #include <string>
36 #include <tuple>
37 #include <type_traits>
38 #include <vector>
39
40 #include "ruy/allocator.h"
41 #include "ruy/context.h"
42 #include "ruy/context_get_ctx.h"
43 #include "ruy/ctx.h"
44 #include "ruy/gtest_wrapper.h" // IWYU pragma: export
45 #include "ruy/matrix.h" // IWYU pragma: export
46 #include "ruy/mul_params.h" // IWYU pragma: export
47 #include "ruy/pack_common.h"
48 #include "ruy/platform.h"
49 #include "ruy/pmu.h"
50 #include "ruy/reference_mul.h"
51 #include "ruy/ruy.h"
52 #include "ruy/size_util.h"
53 #include "ruy/time.h"
54
55 #ifdef RUY_TEST_EXTERNAL_PATHS
56 #define EIGEN_USE_THREADS
57 #define EIGEN_USE_CUSTOM_THREAD_POOL
58 #pragma GCC diagnostic push
59 #pragma GCC diagnostic ignored "-Wunused-parameter"
60 #include "third_party/eigen3/Eigen/Core"
61 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
62 #pragma GCC diagnostic pop
63 #include "third_party/gemmlowp/public/gemmlowp.h"
64 #include "third_party/lapack/blas.h"
65 #endif
66
67 #ifdef RUY_PROFILER
68 #include "ruy/profiler/profiler.h"
69 #endif
70
71 #ifdef __linux__
72 #include <sys/mman.h>
73 #include <unistd.h>
74 #endif
75
76 namespace ruy {
77
78 const float kClampRatio = 0.1f;
79
80 enum class ExternalPath {
81 kNone,
82 kReference,
83 kGemmlowp,
84 kEigen,
85 kEigenTensor,
86 kOpenBlas
87 };
88
inline std::vector<std::string>* CoveredPaths() {
90 static std::vector<std::string> covered_paths;
91 return &covered_paths;
92 }
93
inline const char* PathName(Path path) {
95 #define RUY_PATHNAME_CASE(NAME) \
96 case Path::NAME: \
97 return #NAME;
98 switch (path) {
99 RUY_PATHNAME_CASE(kStandardCpp)
100 RUY_PATHNAME_CASE(kInternalStandardCppVariant1)
101 RUY_PATHNAME_CASE(kInternalStandardCppVariant2)
102 RUY_PATHNAME_CASE(kInternalStandardCppVariant3)
103
104 #if RUY_PLATFORM_NEON
105 RUY_PATHNAME_CASE(kNeon)
106 RUY_PATHNAME_CASE(kNeonDotprod)
107 #elif RUY_PLATFORM_X86
108 RUY_PATHNAME_CASE(kAvx2Fma)
109 RUY_PATHNAME_CASE(kAvx512)
110 RUY_PATHNAME_CASE(kAvx)
111 #endif
112 default:
113 RUY_CHECK(false);
114 return nullptr;
115 }
116 #undef RUY_PATHNAME_CASE
117 }
118
inline const char* TuningName(Tuning tuning) {
120 #define RUY_SUBPATHNAME_CASE(NAME) \
121 case Tuning::NAME: \
122 return #NAME;
123 switch (tuning) {
124 RUY_SUBPATHNAME_CASE(kA55ish)
125 RUY_SUBPATHNAME_CASE(kX1)
126 RUY_SUBPATHNAME_CASE(kGeneric)
127 default:
128 RUY_CHECK(false);
129 return nullptr;
130 }
131 #undef RUY_SUBPATHNAME_CASE
132 }
133
inline const char* PathName(ExternalPath path) {
135 #define RUY_PATHNAME_CASE(NAME) \
136 case ExternalPath::NAME: \
137 return #NAME;
138 switch (path) {
139 RUY_PATHNAME_CASE(kReference)
140 RUY_PATHNAME_CASE(kGemmlowp)
141 RUY_PATHNAME_CASE(kEigen)
142 RUY_PATHNAME_CASE(kEigenTensor)
143 RUY_PATHNAME_CASE(kOpenBlas)
144 default:
145 RUY_CHECK(false);
146 return nullptr;
147 }
148 #undef RUY_PATHNAME_CASE
149 }
150
151 inline std::ostream& operator<<(std::ostream& stream, Path path) {
152 return stream << PathName(path);
153 }
154
155 inline std::ostream& operator<<(std::ostream& stream,
156 ExternalPath external_path) {
157 return stream << PathName(external_path);
158 }
159
160 template <typename ContainerType>
std::string Join(const ContainerType& container) {
162 if (container.empty()) {
163 return "<empty>";
164 }
165 std::ostringstream stream;
166 auto it = container.begin();
167 stream << *it++;
168 for (; it != container.end(); ++it) {
169 stream << ", ";
170 stream << *it;
171 }
172 return stream.str();
173 }
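// For example (illustrative), Join applied to a std::vector<std::string>
// containing {"kNeon", "kNeonDotprod"} returns "kNeon, kNeonDotprod", and an
// empty container yields "<empty>". It is used for the covered-paths logging
// below.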
174
175 struct LogCoveredPathsOnDestruction final {
~LogCoveredPathsOnDestruction() {
177 std::cerr << "Covered paths: " << Join(*CoveredPaths()) << std::endl;
178
179 // When we know that it would be abnormal for some path not to be covered,
180 // we check it here. Accidentally disabling SIMD paths has occurred in the
181 // past and is one of the biggest performance regressions imaginable.
182 //
183 // TODO: we should be able to require some x86 paths as well, at least
184 // SSE4.2.
185
186 #if RUY_PLATFORM_ARM
187 // When testing on ARM32 or ARM64, make sure that we covered the NEON path.
188 // NEON is always available on ARM64, and we treat it as always available
189 // also on ARM32.
190 bool found_neon = false;
191 for (const std::string& covered_path : *CoveredPaths()) {
192 if (covered_path == "kNeon") {
193 found_neon = true;
194 }
195 }
196 if (!found_neon) {
197 std::cerr
198 << "Error: we haven't tested the kNeon path as we should have.\n"
199 << std::endl;
200 abort();
201 }
202 #endif
203
204 // When testing on ARM64 ChromiumOS emulator, make sure that we covered
205 // the dotprod path. We're getting such coverage at the moment thanks to
206 // using a sufficiently recent emulator, and we don't want to regress that.
207 #if RUY_PLATFORM_ARM_64 && defined RUY_TESTING_ON_CHROMIUMOS
208 bool found_dotprod = false;
209 for (const std::string& covered_path : *CoveredPaths()) {
210 if (covered_path == "kNeonDotprod") {
211 found_dotprod = true;
212 }
213 }
214 if (!found_dotprod) {
215 std::cerr
<< "Error: we haven't tested the kNeonDotprod path as we should "
   "have. At the moment, this is required on ChromiumOS because the "
   "emulator that we run tests in supports dot-product instructions, "
   "and we care very much about not regressing that coverage. "
   "If this test was run in an emulator, please upgrade to a newer "
   "emulator version. If this test was run on an actual device and "
   "you need to be able to run ruy tests on devices that do not "
   "support dot-product instructions, get in touch with us.\n"
225 << std::endl;
226 abort();
227 }
228 #endif
229 }
static void Singleton() { static LogCoveredPathsOnDestruction singleton; }
231 };
232
233 enum class RandomRange {
234 kGeneral,
235 kAvoidMinValue,
236 kOffCenterAvoidMinValue,
237 kReasonableSrcZeroPoint,
238 kReasonableDstZeroPoint,
239 kBias
240 };
241
242 template <typename Scalar,
243 bool IsFloatingPoint = std::is_floating_point<Scalar>::value>
244 struct RandomRangeBounds {};
245
246 template <typename Scalar>
247 struct RandomRangeBounds<Scalar, true> {
248 static Scalar GetMinBound(RandomRange range) {
249 switch (range) {
250 case RandomRange::kGeneral:
251 return -1;
252 case RandomRange::kAvoidMinValue:
253 return -1;
254 case RandomRange::kOffCenterAvoidMinValue:
255 return -1;
256 case RandomRange::kReasonableSrcZeroPoint:
257 return 0;
258 case RandomRange::kReasonableDstZeroPoint:
259 return 0;
260 case RandomRange::kBias:
261 return -1;
262 default:
263 RUY_CHECK(false);
264 return 0;
265 }
266 }
267 static Scalar GetMaxBound(RandomRange range) {
268 switch (range) {
269 case RandomRange::kGeneral:
270 return 1;
271 case RandomRange::kAvoidMinValue:
272 return 1;
273 case RandomRange::kOffCenterAvoidMinValue:
274 return 1;
275 case RandomRange::kReasonableSrcZeroPoint:
276 return 0;
277 case RandomRange::kReasonableDstZeroPoint:
278 return 0;
279 case RandomRange::kBias:
280 return 1;
281 default:
282 RUY_CHECK(false);
283 return 0;
284 }
285 }
286 };
287
288 template <typename Scalar>
289 Scalar WeightedSum(Scalar s1, float weight1, Scalar s2, float weight2) {
290 float sum = s1 * weight1 + s2 * weight2;
291 float clamped = std::min<float>(
292 std::numeric_limits<Scalar>::max(),
293 std::max<float>(std::numeric_limits<Scalar>::lowest(), sum));
294 return static_cast<Scalar>(clamped);
295 }
296
297 template <typename Scalar>
298 Scalar Parametrized(float param) {
299 return WeightedSum(std::numeric_limits<Scalar>::max(), param,
300 std::numeric_limits<Scalar>::lowest(), 1 - param);
301 }
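// Worked example of the two helpers above (assuming Scalar = std::uint8_t,
// whose range is [0, 255]): Parametrized<std::uint8_t>(0.4) computes
// WeightedSum(255, 0.4, 0, 0.6) = 0.4 * 255 = 102, and Parametrized(0.6)
// gives 153. These are the bounds used by kReasonableDstZeroPoint in the
// integer RandomRangeBounds specialization below.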
302
303 template <typename Scalar>
304 struct RandomRangeBounds<Scalar, false> {
305 static Scalar GetMinBound(RandomRange range) {
306 static constexpr double offcenteredness =
307 0.02; // Shift lower limit by about 5 for range of 255.
308 switch (range) {
309 case RandomRange::kGeneral:
310 return std::numeric_limits<Scalar>::lowest();
311 case RandomRange::kAvoidMinValue:
312 return 1 + std::numeric_limits<Scalar>::lowest();
313 case RandomRange::kOffCenterAvoidMinValue:
314 return 1 + std::numeric_limits<Scalar>::lowest() +
315 static_cast<Scalar>(
316 offcenteredness * std::numeric_limits<Scalar>::max() -
317 offcenteredness *
318 (std::numeric_limits<Scalar>::lowest() + 1));
319 case RandomRange::kReasonableSrcZeroPoint:
320 return std::numeric_limits<Scalar>::lowest();
321 case RandomRange::kReasonableDstZeroPoint:
322 return Parametrized<Scalar>(0.4);
323 case RandomRange::kBias:
324 return std::is_same<Scalar, std::int32_t>::value
325 ? static_cast<Scalar>(-10000)
326 : 0;
327 default:
328 RUY_CHECK(false);
329 return 0;
330 }
331 }
332 static Scalar GetMaxBound(RandomRange range) {
333 switch (range) {
334 case RandomRange::kGeneral:
335 return std::numeric_limits<Scalar>::max();
336 case RandomRange::kAvoidMinValue:
337 return std::numeric_limits<Scalar>::max();
338 case RandomRange::kOffCenterAvoidMinValue:
339 return std::numeric_limits<Scalar>::max();
340 case RandomRange::kReasonableSrcZeroPoint:
341 return std::numeric_limits<Scalar>::max();
342 case RandomRange::kReasonableDstZeroPoint:
343 return Parametrized<Scalar>(0.6);
344 case RandomRange::kBias:
345 return std::is_same<Scalar, std::int32_t>::value
346 ? static_cast<Scalar>(10000)
347 : 0;
348 default:
349 RUY_CHECK(false);
350 return 0;
351 }
352 }
353 };
354
355 inline std::default_random_engine& global_random_engine() {
356 static std::default_random_engine engine;
357 return engine;
358 }
359
360 template <typename Scalar>
361 struct UniformRandomDistribution {
362 UniformRandomDistribution(RandomRange range)
363 : dist(RandomRangeBounds<Scalar>::GetMinBound(range),
364 RandomRangeBounds<Scalar>::GetMaxBound(range)) {}
365 Scalar Get() { return dist(global_random_engine()); }
366 // std::uniform_int_distribution is specified not to support char types,
367 // only short and wider types. MSVC actually generates an error on
368 // std::uniform_int_distribution<std::int8_t>.
369 using StdDistType = typename std::conditional<
370 std::is_floating_point<Scalar>::value,
371 std::uniform_real_distribution<Scalar>,
372 std::uniform_int_distribution<std::int32_t>>::type;
373 StdDistType dist;
374 };
375
376 template <typename Scalar>
377 void MakeRandomScalar(UniformRandomDistribution<Scalar>* uniform_dist,
378 Scalar* dst) {
379 *dst = uniform_dist->Get();
380 }
381
382 #if defined(__has_feature)
383 #if __has_feature(address_sanitizer)
384 #define RUY_TEST_BUILT_WITH_ASAN
385 #endif
386 #endif
387
// Don't use separate mappings when building with Address Sanitizer, as the
// manual mappings could hide actual address errors from ASan (ASan can't see
// the buffer's actual valid address range inside the manual mapping).
391 #if defined __linux__ && !defined RUY_TEST_BUILT_WITH_ASAN
392 #define RUY_TEST_USE_SEPARATE_MAPPINGS
393 #endif
394
395 template <typename T>
396 struct SeparateMappingAllocator {
397 using value_type = T;
398
399 T* allocate(std::size_t n) {
400 #ifdef RUY_TEST_USE_SEPARATE_MAPPINGS
401 const std::size_t page_size = getpagesize();
402 std::size_t buffer_size = n * sizeof(T);
403 std::size_t rounded_buffer_size = round_up_pot(buffer_size, page_size);
404 // We are going to map an additional page at the end of our buffer, then
405 // unmap it, to ensure that our buffer's end is the last mapped byte, so as
406 // to catch any overrun.
407 void* mapping =
408 mmap(nullptr, rounded_buffer_size + page_size, PROT_READ | PROT_WRITE,
409 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
410 RUY_CHECK_NE(mapping, MAP_FAILED);
411 int unmap_result =
412 munmap(static_cast<char*>(mapping) + rounded_buffer_size, page_size);
413 RUY_CHECK_EQ(unmap_result, 0);
// Clearing the bytes should be redundant since mmap already does it, but
// it does not hurt, and it acts as an assertion checking that we got the
// above mapping and unmapping right.
417 std::memset(mapping, 0, rounded_buffer_size);
418 // Compute the offset to make our buffer end at the last mapped byte.
419 std::size_t buffer_offset = rounded_buffer_size - buffer_size;
420 void* buffer =
421 static_cast<void*>(static_cast<char*>(mapping) + buffer_offset);
422 return static_cast<T*>(buffer);
423 #else
424 T* ret = new T[n];
425 std::memset(ret, 0, n * sizeof(T));
426 return ret;
427 #endif
428 }
429 void deallocate(T* p, std::size_t n) {
430 #ifdef RUY_TEST_USE_SEPARATE_MAPPINGS
431 // The mapped bytes are the buffer address range, rounded on both ends
432 // to page boundary.
433 const std::size_t page_size = getpagesize();
434 std::size_t buffer_size = n * sizeof(T);
435 std::size_t rounded_buffer_size = round_up_pot(buffer_size, page_size);
436 std::uintptr_t p_addr = reinterpret_cast<std::uintptr_t>(p);
437 void* mapping = reinterpret_cast<void*>(p_addr & ~(page_size - 1));
438 int ret = munmap(mapping, rounded_buffer_size);
439 RUY_CHECK_EQ(ret, 0);
440 #else
441 (void)n;
442 delete[] p;
443 #endif
444 }
445 };
446
447 template <typename T>
448 using SeparateMappingVector = std::vector<T, SeparateMappingAllocator<T>>;
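// Worked example of the mapping arithmetic above (a sketch, assuming a
// 4096-byte page and T = float): allocate(100) needs 400 bytes, rounded up to
// one 4096-byte page. Two pages get mmap'd, the trailing guard page is
// munmap'd, and the returned pointer is offset by 4096 - 400 = 3696 bytes so
// that the 400-byte buffer ends exactly at the last mapped byte. Accessing
// one element past the end of a SeparateMappingVector then faults immediately
// instead of silently reading or corrupting adjacent memory.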
449
450 template <typename Scalar, typename Allocator>
451 void MakeRandomVector(UniformRandomDistribution<Scalar>* uniform_dist, int size,
452 std::vector<Scalar, Allocator>* dst) {
453 dst->resize(size);
454 for (auto& x : *dst) {
455 MakeRandomScalar(uniform_dist, &x);
456 }
457 }
458
459 template <typename Scalar>
460 void MakeRandomScalar(RandomRange range, Scalar* dst) {
461 UniformRandomDistribution<Scalar> dist(range);
462 *dst = dist.Get();
463 if (range == RandomRange::kReasonableDstZeroPoint ||
464 range == RandomRange::kReasonableSrcZeroPoint) {
465 if (global_random_engine()() & 1) {
466 *dst = SymmetricZeroPoint<Scalar>();
467 }
468 }
469 }
470
471 template <typename Scalar, typename Allocator>
472 void MakeRandomVector(RandomRange range, int size,
473 std::vector<Scalar, Allocator>* dst) {
474 UniformRandomDistribution<Scalar> dist(range);
475 dst->resize(size);
476 for (auto& x : *dst) {
477 MakeRandomScalar(&dist, &x);
478 }
479 }
480
481 enum class LayoutStyle { kUnstridedLinear, kLinear };
482
483 inline void MakeLayout(int rows, int cols, Order order,
484 LayoutStyle layout_style, Layout* layout) {
485 layout->set_rows(rows);
486 layout->set_cols(cols);
487 layout->set_order(order);
488
489 const int min_stride = order == Order::kColMajor ? rows : cols;
490
491 RUY_CHECK(layout_style == LayoutStyle::kUnstridedLinear ||
492 layout_style == LayoutStyle::kLinear);
493 if (layout_style == LayoutStyle::kUnstridedLinear) {
494 layout->set_stride(min_stride);
495 } else {
496 layout->set_stride(min_stride + 1);
497 }
498 }
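// Example of the stride choice above (illustrative numbers): MakeLayout(3, 4,
// Order::kColMajor, LayoutStyle::kLinear, &layout) yields rows = 3, cols = 4,
// stride = 4, i.e. one element of padding after each column, while
// LayoutStyle::kUnstridedLinear would use the tight stride of 3.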
499
500 template <typename Scalar>
501 struct StorageMatrix {
502 StorageMatrix() = default;
503 StorageMatrix(const StorageMatrix&) = delete;
504 SeparateMappingVector<Scalar> data;
505 Matrix<Scalar> matrix;
506 };
507
508 inline bool IsUnstrided(const Layout& layout) {
509 if (layout.order() == Order::kColMajor) {
510 return layout.stride() == layout.rows();
511 } else {
512 return layout.stride() == layout.cols();
513 }
514 }
515
516 inline bool IsRowMajor(const Layout& layout) {
517 return layout.order() == Order::kRowMajor;
518 }
519
520 inline bool IsColMajor(const Layout& layout) {
521 return layout.order() == Order::kColMajor;
522 }
523
524 inline int FlatSize(const Layout& layout) {
525 const int outerdim =
526 layout.order() == Order::kColMajor ? layout.cols() : layout.rows();
527 return layout.stride() * outerdim;
528 }
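// Continuing the MakeLayout example above: for the col-major 3x4 layout with
// stride 4, FlatSize returns 4 * 4 = 16 elements -- the 12 matrix entries
// plus the 4 padding entries -- which is the buffer size that MakeRandom
// allocates below.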
529
530 template <typename Scalar>
531 void VerifyConsistentFields(const StorageMatrix<Scalar>& storage_matrix) {
532 if (storage_matrix.data.empty()) {
533 RUY_CHECK_EQ(storage_matrix.matrix.data(), nullptr);
534 RUY_CHECK_EQ(storage_matrix.matrix.layout().rows(), 0);
535 RUY_CHECK_EQ(storage_matrix.matrix.layout().cols(), 0);
536 } else {
537 RUY_CHECK_EQ(storage_matrix.matrix.data(), storage_matrix.data.data());
538 RUY_CHECK_EQ(FlatSize(storage_matrix.matrix.layout()),
539 static_cast<int>(storage_matrix.data.size()));
540 }
541 }
542
543 template <typename Scalar>
544 void MakeRandom(int rows, int cols, Order order, Scalar zero_point,
545 LayoutStyle layout_style, RandomRange range,
546 StorageMatrix<Scalar>* storage_matrix) {
547 MakeLayout(rows, cols, order, layout_style,
548 storage_matrix->matrix.mutable_layout());
549 storage_matrix->matrix.set_zero_point(zero_point);
550 UniformRandomDistribution<Scalar> data_dist(range);
551 MakeRandomVector(&data_dist, FlatSize(storage_matrix->matrix.layout()),
552 &storage_matrix->data);
553 storage_matrix->matrix.set_data(storage_matrix->data.data());
554 VerifyConsistentFields(*storage_matrix);
555 }
556
557 template <typename Scalar>
558 struct TestResult {
559 void operator=(const TestResult&) = delete;
560 void operator=(const TestResult&&) = delete;
561 StorageMatrix<Scalar> storage_matrix;
562 Path path = Path::kNone;
563 Tuning tuning = Tuning::kAuto;
564 ExternalPath external_path = ExternalPath::kNone;
565 float latency;
566 float l1_refill_rate;
567 float l2_refill_rate;
568 float l3_refill_rate;
569 float l1tlb_refill_rate;
570 float l2tlb_refill_rate;
571 float mispred_rate;
572 float frontend_stall_rate;
573 float backend_stall_rate;
574 };
575
576 template <typename Scalar>
577 std::string PathName(const TestResult<Scalar>& result) {
578 std::string pathname;
579 if (result.path != Path::kNone) {
580 pathname.assign(PathName(result.path));
581 } else if (result.external_path != ExternalPath::kNone) {
582 pathname.assign(PathName(result.external_path));
583 } else {
584 RUY_CHECK(false);
585 }
586 if (result.tuning != Tuning::kAuto) {
587 pathname.append("/");
588 pathname.append(TuningName(result.tuning));
589 }
590 return pathname;
591 }
592
593 template <typename tLhsScalar, typename tRhsScalar, typename tAccumScalar,
594 typename tDstScalar>
595 struct TestSet final {
596 using LhsScalar = tLhsScalar;
597 using RhsScalar = tRhsScalar;
598 using AccumScalar = tAccumScalar;
599 using DstScalar = tDstScalar;
600 using MulParamsType = MulParams<AccumScalar, DstScalar>;
601 using TestResultType = TestResult<DstScalar>;
602
603 void Run() {
604 MakeZeroPoints();
605 MakeLhsRhs();
606 MakeMulParams();
607 MakeOtherParams();
608 MakeResultPaths();
609 Eval();
610 Verify();
611 }
612
613 private:
614 void MakeZeroPoints();
615 void MakeLhsRhs();
616 void MakeMulParams();
617 void MakeResultPaths();
618 void MakeOtherParams();
619 void EvalAndVerify();
620 void Eval();
621 void Verify();
622
623 void EvalResult(TestResultType* result);
624 void EvalRuy(TestResultType* result);
625 void DoMul(TestResultType* result);
626 void Benchmark(TestResultType* result);
627 void VerifyTestResults() const;
628
629 public:
630 enum class LifeStage {
631 kInitial,
632 kHasZeroPoints,
633 kHasLhsRhs,
634 kHasMulParams,
635 kHasOtherParams,
636 kHasResultPaths,
637 kEvaluated,
638 kFinal
639 };
640
641 ~TestSet();
642
643 LifeStage life_stage = LifeStage::kInitial;
644
645 int rows = 0;
646 int cols = 0;
647 int depth = 0;
648 Order lhs_order = Order::kRowMajor;
649 Order rhs_order = Order::kColMajor;
650 Order dst_order = Order::kColMajor;
651 LayoutStyle layout_style = LayoutStyle::kUnstridedLinear;
652
653 bool use_specified_zero_points = false;
654 LhsScalar lhs_zero_point = 0;
655 RhsScalar rhs_zero_point = 0;
656 DstScalar dst_zero_point = 0;
657
658 SeparateMappingVector<AccumScalar> per_channel_multiplier_fixedpoint;
659 SeparateMappingVector<int> per_channel_multiplier_exponent;
660
661 StorageMatrix<LhsScalar> lhs;
662 StorageMatrix<RhsScalar> rhs;
663 MulParamsType mul_params;
664 SeparateMappingVector<AccumScalar> bias_data;
665 std::vector<std::unique_ptr<TestResultType>> results;
666
667 std::vector<Path> paths;
668 std::vector<ExternalPath> external_paths;
669
670 bool benchmark = false;
671 bool perchannel = false;
672 int max_num_threads = 0;
673
674 bool cache_lhs = false;
675 bool cache_rhs = false;
676 };
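// Minimal usage sketch of TestSet (hypothetical sizes, not an actual test in
// this header): the intended pattern is to fill in the shape and option
// fields and then let Run() drive the life stages declared above, e.g.
//
//   TestSet<std::uint8_t, std::uint8_t, std::int32_t, std::uint8_t> test_set;
//   test_set.rows = 32;
//   test_set.depth = 64;
//   test_set.cols = 16;
//   test_set.Run();  // kInitial -> ... -> kFinal; Verify() then checks that
//                    // all evaluated paths agree on the result.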
677
678 inline PmuEvents& GlobalPmuEvents() {
679 static PmuEvents pmu;
680 return pmu;
681 }
682
683 inline Context& GlobalContext() {
684 // Ensure that GlobalPmuEvents is constructed before we create any context.
685 // This ensures that pmu counters are opened before we create any worker
686 // thread, which is necessary to count events from worker threads.
687 GlobalPmuEvents();
688
689 static Context context;
690 return context;
691 }
692
693 template <typename LhsScalar, typename RhsScalar, typename AccumScalar,
694 typename DstScalar>
695 TestSet<LhsScalar, RhsScalar, AccumScalar, DstScalar>::~TestSet() {
696 RUY_CHECK_EQ(life_stage, LifeStage::kFinal);
697 LogCoveredPathsOnDestruction::Singleton();
698 GlobalContext().ClearPrepackedCache();
699 }
700
701 #if defined(__has_feature)
702 #if __has_feature(thread_sanitizer)
703 #define RUY_TSAN
704 #endif
705 #if __has_feature(address_sanitizer)
706 #define RUY_ASAN
707 #endif
708 #endif // defined(__has_feature)
709
710 template <typename LhsScalar, typename RhsScalar, typename AccumScalar,
711 typename DstScalar>
712 void TestSet<LhsScalar, RhsScalar, AccumScalar, DstScalar>::DoMul(
713 TestResultType* result) {
714 Mul<kAllPathsIncludingInternalVariants>(lhs.matrix, rhs.matrix, mul_params,
715 &GlobalContext(),
716 &result->storage_matrix.matrix);
717 }
718
719 template <typename LhsScalar, typename RhsScalar, typename AccumScalar,
720 typename DstScalar>
721 void TestSet<LhsScalar, RhsScalar, AccumScalar, DstScalar>::EvalRuy(
722 TestResultType* result) {
723 GlobalContext().set_explicit_tuning(result->tuning);
724 if (max_num_threads) {
725 GlobalContext().set_max_num_threads(max_num_threads);
726 } else if (benchmark) {
727 GlobalContext().set_max_num_threads(1);
728 } else {
729 GlobalContext().set_max_num_threads(1 + global_random_engine()() % 8);
730 }
731 get_ctx(&GlobalContext())->SetRuntimeEnabledPaths(result->path);
732 DoMul(result);
// When caching is enabled, Mul is stateful, so we run it a second time to
// get coverage of that behavior.
735 if (!benchmark && (cache_lhs || cache_rhs)) {
736 DoMul(result);
737 }
738 RUY_CHECK_EQ(GlobalContext().last_used_path(), result->path);
739 GlobalContext().set_explicit_tuning(Tuning::kAuto);
740 GlobalContext().set_max_num_threads(1);
741 }
742
743 #ifdef RUY_TEST_EXTERNAL_PATHS
744
745 template <typename Scalar, gemmlowp::MapOrder tOrder>
746 void WrapGemmlowp(const Matrix<Scalar>& src,
747 gemmlowp::MatrixMap<const Scalar, tOrder>* dst) {
748 RUY_CHECK(src.layout().order() == (tOrder == gemmlowp::MapOrder::ColMajor
749 ? Order::kColMajor
750 : Order::kRowMajor));
751 *dst = gemmlowp::MatrixMap<const Scalar, tOrder>(
752 src.data(), src.layout().rows(), src.layout().cols(),
753 src.layout().stride());
754 }
755
756 template <typename Scalar, gemmlowp::MapOrder tOrder>
757 void WrapGemmlowpMutable(Matrix<Scalar>* src,
758 gemmlowp::MatrixMap<Scalar, tOrder>* dst) {
759 RUY_CHECK(src->layout().order() == (tOrder == gemmlowp::MapOrder::ColMajor
760 ? Order::kColMajor
761 : Order::kRowMajor));
762 *dst = gemmlowp::MatrixMap<Scalar, tOrder>(src->data(), src->layout().rows(),
763 src->layout().cols(),
764 src->layout().stride());
765 }
766
767 template <Order tOrder>
768 struct GemmlowpOrder {};
769
770 template <>
771 struct GemmlowpOrder<Order::kColMajor> {
772 static constexpr gemmlowp::MapOrder kValue = gemmlowp::MapOrder::ColMajor;
773 };
774
775 template <>
776 struct GemmlowpOrder<Order::kRowMajor> {
777 static constexpr gemmlowp::MapOrder kValue = gemmlowp::MapOrder::RowMajor;
778 };
779
780 inline gemmlowp::GemmContext& GlobalGemmlowpContext() {
781 static gemmlowp::GemmContext context;
782 return context;
783 }
784
785 template <Order LhsOrder, Order RhsOrder, Order DstOrder, typename LhsScalar,
786 typename RhsScalar, typename DstScalar, typename MulParamsType>
787 void EvalGemmlowp(const Matrix<LhsScalar>& lhs, const Matrix<RhsScalar>& rhs,
788 const MulParamsType& mul_params, int max_num_threads,
789 Matrix<DstScalar>* dst) {
790 static constexpr gemmlowp::MapOrder kGemmlowpLhsOrder =
791 GemmlowpOrder<LhsOrder>::kValue;
792 static constexpr gemmlowp::MapOrder kGemmlowpRhsOrder =
793 GemmlowpOrder<RhsOrder>::kValue;
794 static constexpr gemmlowp::MapOrder kGemmlowpDstOrder =
795 GemmlowpOrder<DstOrder>::kValue;
796 gemmlowp::MatrixMap<const LhsScalar, kGemmlowpLhsOrder> gemmlowp_lhs;
797 gemmlowp::MatrixMap<const RhsScalar, kGemmlowpRhsOrder> gemmlowp_rhs;
798 gemmlowp::MatrixMap<DstScalar, kGemmlowpDstOrder> gemmlowp_dst;
799 WrapGemmlowp(lhs, &gemmlowp_lhs);
800 WrapGemmlowp(rhs, &gemmlowp_rhs);
801 WrapGemmlowpMutable(dst, &gemmlowp_dst);
802
803 gemmlowp::OutputStageScaleInt32ByFixedPointAndExponent quantize_down_stage;
804 quantize_down_stage.result_offset_after_shift = dst->zero_point();
805 quantize_down_stage.result_fixedpoint_multiplier =
806 mul_params.multiplier_fixedpoint();
807 quantize_down_stage.result_exponent = mul_params.multiplier_exponent();
808 gemmlowp::OutputStageScaleInt32ByFixedPointAndExponentPC<
809 gemmlowp::VectorShape::Col>
810 quantize_down_stage_pc;
811 quantize_down_stage_pc.result_offset_after_shift = dst->zero_point();
812 using ColVectorMap =
813 gemmlowp::VectorMap<const std::int32_t, gemmlowp::VectorShape::Col>;
814 quantize_down_stage_pc.result_fixedpoint_multiplier = ColVectorMap(
815 mul_params.multiplier_fixedpoint_perchannel(), lhs.layout().rows());
816 quantize_down_stage_pc.result_exponent = ColVectorMap(
817 mul_params.multiplier_exponent_perchannel(), lhs.layout().rows());
818
819 gemmlowp::OutputStageClamp clamp_stage;
820 clamp_stage.min = mul_params.clamp_min();
821 clamp_stage.max = mul_params.clamp_max();
822 using OutputStageSaturatingCast = typename std::conditional<
823 std::is_same<DstScalar, std::uint8_t>::value,
824 gemmlowp::OutputStageSaturatingCastToUint8,
825 gemmlowp::OutputStageSaturatingCastToInt16>::type;
826 OutputStageSaturatingCast saturating_cast_stage;
827
828 GlobalGemmlowpContext().set_max_num_threads(max_num_threads ? max_num_threads
829 : 1);
830 if (mul_params.bias()) {
831 using ColVectorMap =
832 gemmlowp::VectorMap<const std::int32_t, gemmlowp::VectorShape::Col>;
833 gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_add_stage;
834 bias_add_stage.bias_vector =
835 ColVectorMap(mul_params.bias(), dst->layout().rows());
836 #ifndef GEMMLOWP_SSE4 // gemmlowp perchannel stuff does not build on SSE
837 if (mul_params.multiplier_exponent_perchannel()) {
838 const auto& output_pipeline =
839 std::make_tuple(bias_add_stage, quantize_down_stage_pc, clamp_stage,
840 saturating_cast_stage);
841 gemmlowp::GemmWithOutputPipeline<
842 LhsScalar, DstScalar, gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
843 &GlobalGemmlowpContext(), gemmlowp_lhs, gemmlowp_rhs, &gemmlowp_dst,
844 -lhs.zero_point(), -rhs.zero_point(), output_pipeline);
845 } else // NOLINT[readability/braces]
846 #endif
847 {
848 const auto& output_pipeline =
849 std::make_tuple(bias_add_stage, quantize_down_stage, clamp_stage,
850 saturating_cast_stage);
851 gemmlowp::GemmWithOutputPipeline<
852 LhsScalar, DstScalar, gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
853 &GlobalGemmlowpContext(), gemmlowp_lhs, gemmlowp_rhs, &gemmlowp_dst,
854 -lhs.zero_point(), -rhs.zero_point(), output_pipeline);
855 }
856 } else {
857 #ifndef GEMMLOWP_SSE4 // gemmlowp perchannel stuff does not build on SSE
858 if (mul_params.multiplier_exponent_perchannel()) {
859 const auto& output_pipeline = std::make_tuple(
860 quantize_down_stage_pc, clamp_stage, saturating_cast_stage);
861 gemmlowp::GemmWithOutputPipeline<
862 LhsScalar, DstScalar, gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
863 &GlobalGemmlowpContext(), gemmlowp_lhs, gemmlowp_rhs, &gemmlowp_dst,
864 -lhs.zero_point(), -rhs.zero_point(), output_pipeline);
865 } else // NOLINT[readability/braces]
866 #endif
867 {
868 const auto& output_pipeline = std::make_tuple(
869 quantize_down_stage, clamp_stage, saturating_cast_stage);
870 gemmlowp::GemmWithOutputPipeline<
871 LhsScalar, DstScalar, gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
872 &GlobalGemmlowpContext(), gemmlowp_lhs, gemmlowp_rhs, &gemmlowp_dst,
873 -lhs.zero_point(), -rhs.zero_point(), output_pipeline);
874 }
875 }
876 }
877
878 inline constexpr int Mash(Order LhsOrder, Order RhsOrder, Order DstOrder) {
879 return (LhsOrder == Order::kRowMajor ? 4 : 0) +
880 (RhsOrder == Order::kRowMajor ? 2 : 0) +
881 (DstOrder == Order::kRowMajor ? 1 : 0);
882 }
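// For instance (illustrative), Mash(Order::kRowMajor, Order::kColMajor,
// Order::kColMajor) evaluates to 4 (binary 100), and the eight values 0..7
// enumerate every (lhs, rhs, dst) storage-order combination; the
// EVALGEMMLOWP_CASE* macros below expand the switch over exactly those cases.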
883
884 template <typename LhsScalar, typename RhsScalar, typename DstScalar,
885 typename MulParamsType>
886 void EvalGemmlowp(const Matrix<LhsScalar>& lhs, const Matrix<RhsScalar>& rhs,
887 const MulParamsType& mul_params, int max_num_threads,
888 Matrix<DstScalar>* dst) {
889 int index =
890 Mash(lhs.layout().order(), rhs.layout().order(), dst->layout().order());
891 switch (index) {
892 #define EVALGEMMLOWP_CASE3(LHS, RHS, DST) \
893 case Mash(LHS, RHS, DST): \
894 return EvalGemmlowp<LHS, RHS, DST>(lhs, rhs, mul_params, max_num_threads, \
895 dst);
896 #define EVALGEMMLOWP_CASE2(LHS, RHS) \
897 EVALGEMMLOWP_CASE3(LHS, RHS, Order::kColMajor) \
898 EVALGEMMLOWP_CASE3(LHS, RHS, Order::kRowMajor)
899 #define EVALGEMMLOWP_CASE1(LHS) \
900 EVALGEMMLOWP_CASE2(LHS, Order::kColMajor) \
901 EVALGEMMLOWP_CASE2(LHS, Order::kRowMajor)
902
903 EVALGEMMLOWP_CASE1(Order::kColMajor)
904 EVALGEMMLOWP_CASE1(Order::kRowMajor)
905
906 #undef EVALGEMMLOWP_CASE1
907 #undef EVALGEMMLOWP_CASE2
908 #undef EVALGEMMLOWP_CASE3
909
910 default:
911 RUY_CHECK(false);
912 }
913 }
914
915 template <Order tOrder>
916 struct EigenOrder {};
917
918 template <>
919 struct EigenOrder<Order::kColMajor> {
920 static constexpr int kValue = Eigen::ColMajor;
921 };
922
923 template <>
924 struct EigenOrder<Order::kRowMajor> {
925 static constexpr int kValue = Eigen::RowMajor;
926 };
927
928 template <Order LhsOrder, Order RhsOrder, Order DstOrder, typename LhsScalar,
929 typename RhsScalar, typename DstScalar, typename MulParamsType>
930 void EvalEigen(const Matrix<LhsScalar>& lhs, const Matrix<RhsScalar>& rhs,
931 const MulParamsType& mul_params, int max_num_threads,
932 Matrix<DstScalar>* dst) {
933 RUY_CHECK_EQ(lhs.zero_point(), 0);
934 RUY_CHECK_EQ(rhs.zero_point(), 0);
935 RUY_CHECK_EQ(dst->zero_point(), 0);
936 RUY_CHECK_EQ(mul_params.multiplier_fixedpoint(), 0);
937 RUY_CHECK_EQ(mul_params.multiplier_exponent(), 0);
938
939 static constexpr int kEigenLhsOrder = EigenOrder<LhsOrder>::kValue;
940 static constexpr int kEigenRhsOrder = EigenOrder<RhsOrder>::kValue;
941 static constexpr int kEigenDstOrder = EigenOrder<DstOrder>::kValue;
942
943 using EigenLhsType = typename Eigen::Matrix<LhsScalar, Eigen::Dynamic,
944 Eigen::Dynamic, kEigenLhsOrder>::
945 template StridedConstMapType<Eigen::OuterStride<Eigen::Dynamic>>::type;
946 using EigenRhsType = typename Eigen::Matrix<RhsScalar, Eigen::Dynamic,
947 Eigen::Dynamic, kEigenRhsOrder>::
948 template StridedConstMapType<Eigen::OuterStride<Eigen::Dynamic>>::type;
949 using EigenDstType = typename Eigen::Matrix<DstScalar, Eigen::Dynamic,
950 Eigen::Dynamic, kEigenDstOrder>::
951 template StridedMapType<Eigen::OuterStride<Eigen::Dynamic>>::type;
952 using EigenBiasType =
953 typename Eigen::Matrix<DstScalar, Eigen::Dynamic, 1>::ConstMapType;
954
955 EigenLhsType eigen_lhs(
956 lhs.data(), lhs.layout().rows(), lhs.layout().cols(),
957 Eigen::OuterStride<Eigen::Dynamic>(lhs.layout().stride()));
958 EigenRhsType eigen_rhs(
959 rhs.data(), rhs.layout().rows(), rhs.layout().cols(),
960 Eigen::OuterStride<Eigen::Dynamic>(rhs.layout().stride()));
961 EigenDstType eigen_dst(
962 dst->data(), dst->layout().rows(), dst->layout().cols(),
963 Eigen::OuterStride<Eigen::Dynamic>(dst->layout().stride()));
964 Eigen::setNbThreads(max_num_threads ? max_num_threads : 1);
965
966 if (mul_params.bias()) {
967 EigenBiasType eigen_bias(mul_params.bias(), dst->layout().rows());
968 if (mul_params.clamp_max() == std::numeric_limits<DstScalar>::infinity() &&
969 mul_params.clamp_min() == -std::numeric_limits<DstScalar>::infinity()) {
970 eigen_dst.noalias() = (eigen_lhs * eigen_rhs).colwise() + eigen_bias;
971 } else {
972 eigen_dst.noalias() = ((eigen_lhs * eigen_rhs).colwise() + eigen_bias)
973 .cwiseMin(mul_params.clamp_max())
974 .cwiseMax(mul_params.clamp_min());
975 }
976 } else {
977 if (mul_params.clamp_max() == std::numeric_limits<DstScalar>::infinity() &&
978 mul_params.clamp_min() == -std::numeric_limits<DstScalar>::infinity()) {
979 eigen_dst.noalias() = eigen_lhs * eigen_rhs;
980 } else {
981 eigen_dst.noalias() = (eigen_lhs * eigen_rhs)
982 .cwiseMin(mul_params.clamp_max())
983 .cwiseMax(mul_params.clamp_min());
984 }
985 }
986 }
987
988 template <typename LhsScalar, typename RhsScalar, typename DstScalar,
989 typename MulParamsType>
990 void EvalEigen(const Matrix<LhsScalar>& lhs, const Matrix<RhsScalar>& rhs,
991 const MulParamsType& mul_params, int max_num_threads,
992 Matrix<DstScalar>* dst) {
993 int index =
994 Mash(lhs.layout().order(), rhs.layout().order(), dst->layout().order());
995 switch (index) {
996 #define EVALEIGEN_CASE3(LHS, RHS, DST) \
997 case Mash(LHS, RHS, DST): \
998 return EvalEigen<LHS, RHS, DST>(lhs, rhs, mul_params, max_num_threads, dst);
999 #define EVALEIGEN_CASE2(LHS, RHS) \
1000 EVALEIGEN_CASE3(LHS, RHS, Order::kColMajor) \
1001 EVALEIGEN_CASE3(LHS, RHS, Order::kRowMajor)
1002 #define EVALEIGEN_CASE1(LHS) \
1003 EVALEIGEN_CASE2(LHS, Order::kColMajor) \
1004 EVALEIGEN_CASE2(LHS, Order::kRowMajor)
1005
1006 EVALEIGEN_CASE1(Order::kColMajor)
1007 EVALEIGEN_CASE1(Order::kRowMajor)
1008
1009 #undef EVALEIGEN_CASE1
1010 #undef EVALEIGEN_CASE2
1011 #undef EVALEIGEN_CASE3
1012
1013 default:
1014 RUY_CHECK(false);
1015 }
1016 }
1017
1018 template <Order LhsOrder, Order RhsOrder, Order DstOrder, typename Scalar,
1019 typename MulParamsType>
1020 void EvalEigenTensor(const Matrix<Scalar>& lhs, const Matrix<Scalar>& rhs,
1021 const MulParamsType& mul_params, int max_num_threads,
1022 Matrix<Scalar>* dst) {
1023 RUY_CHECK_EQ(lhs.zero_point(), 0);
1024 RUY_CHECK_EQ(rhs.zero_point(), 0);
1025 RUY_CHECK_EQ(dst->zero_point(), 0);
1026 RUY_CHECK_EQ(mul_params.multiplier_fixedpoint(), 0);
1027 RUY_CHECK_EQ(mul_params.multiplier_exponent(), 0);
1028
1029 // Eigen::TensorMap only supports unstrided layouts
1030 RUY_CHECK(IsUnstrided(lhs.layout()));
1031 RUY_CHECK(IsUnstrided(rhs.layout()));
1032 RUY_CHECK(IsUnstrided(dst->layout()));
1033
1034 using TensorLhsType =
1035 Eigen::TensorMap<Eigen::Tensor<const Scalar, 2, Eigen::ColMajor>>;
1036 using TensorRhsType =
1037 Eigen::TensorMap<Eigen::Tensor<const Scalar, 2, Eigen::ColMajor>>;
1038 using TensorDstType =
1039 Eigen::TensorMap<Eigen::Tensor<Scalar, 2, Eigen::ColMajor>>;
1040 using TensorBiasType =
1041 Eigen::TensorMap<Eigen::Tensor<const Scalar, 1, Eigen::ColMajor>>;
1042
1043 const bool tr = DstOrder == Order::kRowMajor;
1044 const auto& contract_lhs = tr ? rhs : lhs;
1045 const auto& contract_rhs = tr ? lhs : rhs;
1046
1047 TensorLhsType tensor_lhs(
1048 contract_lhs.data(),
1049 LhsOrder == Order::kColMajor ? contract_lhs.layout().rows()
1050 : contract_lhs.layout().cols(),
1051 LhsOrder == Order::kColMajor ? contract_lhs.layout().cols()
1052 : contract_lhs.layout().rows());
1053 TensorRhsType tensor_rhs(
1054 contract_rhs.data(),
1055 RhsOrder == Order::kColMajor ? contract_rhs.layout().rows()
1056 : contract_rhs.layout().cols(),
1057 RhsOrder == Order::kColMajor ? contract_rhs.layout().cols()
1058 : contract_rhs.layout().rows());
1059 TensorDstType tensor_dst(dst->data(),
1060 DstOrder == Order::kColMajor ? dst->layout().rows()
1061 : dst->layout().cols(),
1062 DstOrder == Order::kColMajor ? dst->layout().cols()
1063 : dst->layout().rows());
1064 using DimPair =
1065 typename Eigen::Tensor<Scalar, 1, 0, Eigen::Index>::DimensionPair;
1066 Eigen::array<DimPair, 1> contract_dims{
1067 {DimPair((LhsOrder == Order::kColMajor) ? 1 : 0,
1068 (RhsOrder == Order::kColMajor) ? 0 : 1)}};
1069 static Eigen::ThreadPool pool(max_num_threads ? max_num_threads : 1);
1070 static Eigen::ThreadPoolDevice device(&pool, pool.NumThreads());
1071 if (mul_params.bias()) {
1072 TensorBiasType tensor_bias(mul_params.bias(), dst->layout().rows());
1073 Eigen::array<int, 2> bias_2d_shape{tr ? 1 : dst->layout().rows(),
1074 tr ? dst->layout().rows() : 1};
1075 Eigen::array<int, 2> bcast{tr ? dst->layout().cols() : 1,
1076 tr ? 1 : dst->layout().cols()};
1077 if (mul_params.clamp_max() == std::numeric_limits<Scalar>::infinity() &&
1078 mul_params.clamp_min() == -std::numeric_limits<Scalar>::infinity()) {
tensor_dst.device(device) =
    tensor_lhs.contract(tensor_rhs, contract_dims) +
    tensor_bias.reshape(bias_2d_shape).broadcast(bcast);
1081 } else {
1082 tensor_dst.device(device) =
1083 (tensor_lhs.contract(tensor_rhs, contract_dims) +
1084 tensor_bias.reshape(bias_2d_shape).broadcast(bcast))
1085 .cwiseMin(mul_params.clamp_max())
1086 .cwiseMax(mul_params.clamp_min());
1087 }
1088 } else {
1089 if (mul_params.clamp_max() == std::numeric_limits<Scalar>::infinity() &&
1090 mul_params.clamp_min() == -std::numeric_limits<Scalar>::infinity()) {
1091 tensor_dst.device(device) =
1092 tensor_lhs.contract(tensor_rhs, contract_dims);
1093 } else {
1094 tensor_dst.device(device) = tensor_lhs.contract(tensor_rhs, contract_dims)
1095 .cwiseMin(mul_params.clamp_max())
1096 .cwiseMax(mul_params.clamp_min());
1097 }
1098 }
1099 }
1100
1101 template <typename Scalar, typename MulParamsType>
1102 void EvalEigenTensor(const Matrix<Scalar>& lhs, const Matrix<Scalar>& rhs,
1103 const MulParamsType& mul_params, int max_num_threads,
1104 Matrix<Scalar>* dst) {
1105 int index =
1106 Mash(lhs.layout().order(), rhs.layout().order(), dst->layout().order());
1107 switch (index) {
1108 #define EVALEIGENTENSOR_CASE3(LHS, RHS, DST) \
1109 case Mash(LHS, RHS, DST): \
1110 return EvalEigenTensor<LHS, RHS, DST>(lhs, rhs, mul_params, \
1111 max_num_threads, dst);
1112 #define EVALEIGENTENSOR_CASE2(LHS, RHS) \
1113 EVALEIGENTENSOR_CASE3(LHS, RHS, Order::kColMajor) \
1114 EVALEIGENTENSOR_CASE3(LHS, RHS, Order::kRowMajor)
1115 #define EVALEIGENTENSOR_CASE1(LHS) \
1116 EVALEIGENTENSOR_CASE2(LHS, Order::kColMajor) \
1117 EVALEIGENTENSOR_CASE2(LHS, Order::kRowMajor)
1118
1119 EVALEIGENTENSOR_CASE1(Order::kColMajor)
1120 EVALEIGENTENSOR_CASE1(Order::kRowMajor)
1121
1122 #undef EVALEIGENTENSOR_CASE1
1123 #undef EVALEIGENTENSOR_CASE2
1124 #undef EVALEIGENTENSOR_CASE3
1125
1126 default:
1127 RUY_CHECK(false);
1128 }
1129 }
1130
1131 template <typename Scalar>
1132 struct GenericBlasGemm {};
1133
1134 template <>
1135 struct GenericBlasGemm<lapack::doublereal> {
1136 static void Run(char* transa, char* transb, lapack::integer* m,
1137 lapack::integer* n, lapack::integer* k,
1138 lapack::doublereal* alpha, lapack::doublereal* a,
1139 lapack::integer* lda, lapack::doublereal* b,
1140 lapack::integer* ldb, lapack::doublereal* beta,
1141 lapack::doublereal* c, lapack::integer* ldc) {
1142 dgemm_(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
1143 }
1144 };
1145
1146 template <>
1147 struct GenericBlasGemm<lapack::real> {
1148 static void Run(char* transa, char* transb, lapack::integer* m,
1149 lapack::integer* n, lapack::integer* k, lapack::real* alpha,
1150 lapack::real* a, lapack::integer* lda, lapack::real* b,
1151 lapack::integer* ldb, lapack::real* beta, lapack::real* c,
1152 lapack::integer* ldc) {
1153 sgemm_(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
1154 }
1155 };
1156
1157 inline void TransposeLayout(Layout* layout) {
1158 layout->set_order((layout->order() == Order::kRowMajor) ? Order::kColMajor
1159 : Order::kRowMajor);
1160 int tmp_rows = layout->rows();
1161 layout->set_rows(layout->cols());
1162 layout->set_cols(tmp_rows);
1163 }
1164
1165 template <typename Scalar>
1166 void Transpose(Matrix<Scalar>* matrix) {
1167 TransposeLayout(matrix->mutable_layout());
1168 }
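// Note (illustrative): Transpose only relabels the view. A row-major 3x5
// matrix with stride 5 becomes a col-major 5x3 matrix with the same data
// pointer and the same stride 5; element (r, c) of the original and element
// (c, r) of the transposed view map to the same offset r * 5 + c, so no data
// moves.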
1169
1170 template <typename Scalar, typename MulParamsType>
1171 void EvalOpenBlas(const Matrix<Scalar>& lhs, const Matrix<Scalar>& rhs,
1172 const MulParamsType& mul_params, int max_num_threads,
1173 Matrix<Scalar>* dst) {
1174 RUY_CHECK_EQ(lhs.zero_point(), 0);
1175 RUY_CHECK_EQ(rhs.zero_point(), 0);
1176 RUY_CHECK_EQ(dst->zero_point(), 0);
1177 RUY_CHECK_EQ(mul_params.multiplier_fixedpoint(), 0);
1178 RUY_CHECK_EQ(mul_params.multiplier_exponent(), 0);
1179
1180 Matrix<Scalar> gemm_lhs;
1181 Matrix<Scalar> gemm_rhs;
1182 Matrix<Scalar> gemm_dst;
1183 gemm_dst = *dst;
1184
// Use Transpose to reduce to the all-column-major case.
// Notice that ruy::Matrix merely holds a pointer and does not own its data,
// so Transpose is cheap -- no actual matrix data is being transposed here.
1188 if (dst->layout().order() == Order::kColMajor) {
1189 gemm_lhs = lhs;
1190 gemm_rhs = rhs;
1191 } else {
1192 gemm_lhs = rhs;
1193 gemm_rhs = lhs;
1194 Transpose(&gemm_lhs);
1195 Transpose(&gemm_rhs);
1196 Transpose(&gemm_dst);
1197 }
1198 bool transposed_lhs = false;
1199 bool transposed_rhs = false;
1200
1201 if (gemm_lhs.layout().order() == Order::kRowMajor) {
1202 Transpose(&gemm_lhs);
1203 transposed_lhs = true;
1204 }
1205 if (gemm_rhs.layout().order() == Order::kRowMajor) {
1206 Transpose(&gemm_rhs);
1207 transposed_rhs = true;
1208 }
1209
1210 RUY_CHECK_EQ(gemm_lhs.layout().order(), Order::kColMajor);
1211 RUY_CHECK_EQ(gemm_rhs.layout().order(), Order::kColMajor);
1212 RUY_CHECK_EQ(gemm_dst.layout().order(), Order::kColMajor);
1213
1214 char transa = transposed_lhs ? 'T' : 'N';
1215 char transb = transposed_rhs ? 'T' : 'N';
1216 int m = gemm_lhs.layout().rows();
1217 int n = gemm_rhs.layout().cols();
1218 int k = gemm_lhs.layout().cols();
1219 float alpha = 1;
1220 Scalar* a = gemm_lhs.data();
1221 int lda = gemm_lhs.layout().stride();
1222 Scalar* b = gemm_rhs.data();
1223 int ldb = gemm_rhs.layout().stride();
1224 float beta = 0;
1225 Scalar* c = gemm_dst.data();
1226 int ldc = gemm_dst.layout().stride();
1227 GenericBlasGemm<Scalar>::Run(&transa, &transb, &m, &n, &k, &alpha, a, &lda, b,
1228 &ldb, &beta, c, &ldc);
1229
1230 // BLAS does not allow us to express the bias-addition and clamping, so
1231 // we use Eigen for that.
1232
1233 using EigenDstType =
1234 typename Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>::
1235 template StridedMapType<Eigen::OuterStride<Eigen::Dynamic>>::type;
1236 using EigenBiasType =
1237 typename Eigen::Matrix<Scalar, Eigen::Dynamic, 1>::ConstMapType;
1238
1239 EigenDstType eigen_dst(
1240 gemm_dst.data(), gemm_dst.layout().rows(), gemm_dst.layout().cols(),
1241 Eigen::OuterStride<Eigen::Dynamic>(gemm_dst.layout().stride()));
1242 Eigen::setNbThreads(max_num_threads ? max_num_threads : 1);
1243
1244 if (mul_params.bias()) {
1245 EigenBiasType eigen_bias(mul_params.bias(), dst->layout().rows());
1246 if (mul_params.clamp_max() == std::numeric_limits<Scalar>::infinity() &&
1247 mul_params.clamp_min() == -std::numeric_limits<Scalar>::infinity()) {
1248 eigen_dst.noalias() = eigen_dst.colwise() + eigen_bias;
1249 } else {
1250 eigen_dst.noalias() = (eigen_dst.colwise() + eigen_bias)
1251 .cwiseMin(mul_params.clamp_max())
1252 .cwiseMax(mul_params.clamp_min());
1253 }
1254 } else {
1255 if (mul_params.clamp_max() == std::numeric_limits<Scalar>::infinity() &&
1256 mul_params.clamp_min() == -std::numeric_limits<Scalar>::infinity()) {
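// Nothing to do in this case: there is no bias to add and no clamping
// requested, so the GEMM call above already produced the final result.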
1257 } else {
1258 eigen_dst.noalias() = eigen_dst.cwiseMin(mul_params.clamp_max())
1259 .cwiseMax(mul_params.clamp_min());
1260 }
1261 }
1262 }
1263
1264 template <typename TestSetType>
1265 struct SupportsGemmlowp {
1266 static constexpr bool kValue =
1267 std::is_same<typename TestSetType::LhsScalar, std::uint8_t>::value &&
1268 std::is_same<typename TestSetType::RhsScalar, std::uint8_t>::value;
1269 };
1270
1271 template <typename TestSetType>
1272 struct UsesSingleScalarType {
1273 static constexpr bool kValue =
1274 std::is_same<typename TestSetType::DstScalar,
1275 typename TestSetType::LhsScalar>::value &&
1276 std::is_same<typename TestSetType::DstScalar,
1277 typename TestSetType::RhsScalar>::value &&
1278 std::is_same<typename TestSetType::DstScalar,
1279 typename TestSetType::AccumScalar>::value;
1280 };
1281
1282 template <typename TestSetType,
1283 bool IsFloatingPoint =
1284 std::is_floating_point<typename TestSetType::AccumScalar>::value,
1285 bool EnableGemmlowp = SupportsGemmlowp<TestSetType>::kValue,
1286 bool SingleScalarType = UsesSingleScalarType<TestSetType>::kValue>
1287 struct EvalExternalPathImpl {
1288 using DstScalar = typename TestSetType::DstScalar;
1289 static void Run(TestSetType*, TestResult<DstScalar>*) { RUY_CHECK(false); }
1290 };
1291
1292 template <typename TestSetType>
1293 struct EvalExternalPathImpl<TestSetType, true, false, true> {
1294 using DstScalar = typename TestSetType::DstScalar;
1295 static void Run(TestSetType* test_set, TestResult<DstScalar>* test_result) {
1296 if (test_result->external_path == ExternalPath::kEigen) {
1297 EvalEigen(test_set->lhs.matrix, test_set->rhs.matrix,
1298 test_set->mul_params, test_set->max_num_threads,
1299 &test_result->storage_matrix.matrix);
1300 } else if (test_result->external_path == ExternalPath::kEigenTensor) {
1301 EvalEigenTensor(test_set->lhs.matrix, test_set->rhs.matrix,
1302 test_set->mul_params, test_set->max_num_threads,
1303 &test_result->storage_matrix.matrix);
1304 } else if (test_result->external_path == ExternalPath::kOpenBlas) {
1305 EvalOpenBlas(test_set->lhs.matrix, test_set->rhs.matrix,
1306 test_set->mul_params, test_set->max_num_threads,
1307 &test_result->storage_matrix.matrix);
1308 } else {
1309 RUY_CHECK(false);
1310 }
1311 }
1312 };
1313
1314 template <typename TestSetType, bool SingleScalarType>
1315 struct EvalExternalPathImpl<TestSetType, false, true, SingleScalarType> {
1316 using DstScalar = typename TestSetType::DstScalar;
1317 static void Run(TestSetType* test_set, TestResult<DstScalar>* test_result) {
1318 if (test_result->external_path == ExternalPath::kGemmlowp) {
1319 EvalGemmlowp(test_set->lhs.matrix, test_set->rhs.matrix,
1320 test_set->mul_params, test_set->max_num_threads,
1321 &test_result->storage_matrix.matrix);
1322 } else {
1323 RUY_CHECK(false);
1324 }
1325 }
1326 };
1327
1328 #endif // RUY_TEST_EXTERNAL_PATHS
1329
1330 template <typename TestSetType>
1331 void EvalExternalPath(
1332 TestSetType* test_set,
1333 TestResult<typename TestSetType::DstScalar>* test_result) {
1334 if (test_result->external_path == ExternalPath::kReference) {
1335 // kReference is special because it's always available (the implementation
1336 // is provided by ruy) and supports all cases (quantized and float).
1337 ruy::ReferenceMul(test_set->lhs.matrix, test_set->rhs.matrix,
1338 test_set->mul_params,
1339 &test_result->storage_matrix.matrix);
1340 } else {
1341 #ifdef RUY_TEST_EXTERNAL_PATHS
1342 EvalExternalPathImpl<TestSetType>::Run(test_set, test_result);
1343 #endif // RUY_TEST_EXTERNAL_PATHS
1344 }
1345 }
1346
1347 template <typename Scalar>
1348 bool Agree(ExternalPath external_path1, const Matrix<Scalar>& matrix1,
1349 ExternalPath external_path2, const Matrix<Scalar>& matrix2,
1350 int depth) {
1351 RUY_CHECK_EQ(matrix1.layout().rows(), matrix2.layout().rows());
1352 RUY_CHECK_EQ(matrix1.layout().cols(), matrix2.layout().cols());
1353 RUY_CHECK_EQ(matrix1.zero_point(), matrix2.zero_point());
1354 const int size = matrix1.layout().rows() * matrix1.layout().cols();
1355 double tolerated_max_diff = 0;
1356 double tolerated_mean_diff = 0;
1357 const float kSmallestAllowedDifference =
1358 4. * std::numeric_limits<Scalar>::epsilon();
1359 if (std::is_floating_point<Scalar>::value) {
1360 double max_abs_val = 0;
1361 for (int row = 0; row < matrix1.layout().rows(); row++) {
1362 for (int col = 0; col < matrix1.layout().cols(); col++) {
1363 max_abs_val =
1364 std::max(max_abs_val,
1365 std::abs(static_cast<double>(Element(matrix1, row, col))));
1366 max_abs_val =
1367 std::max(max_abs_val,
1368 std::abs(static_cast<double>(Element(matrix2, row, col))));
1369 }
1370 }
1371 tolerated_max_diff = max_abs_val * std::numeric_limits<Scalar>::epsilon() *
1372 64 * std::sqrt(static_cast<float>(depth));
1373 if (tolerated_max_diff < kSmallestAllowedDifference) {
1374 // Clamp the tolerated max diff to be a bit above machine epsilon if the
1375 // calculated value is too small.
1376 tolerated_max_diff = kSmallestAllowedDifference;
1377 if (external_path1 == ExternalPath::kEigen ||
1378 external_path2 == ExternalPath::kEigen ||
1379 external_path1 == ExternalPath::kEigenTensor ||
1380 external_path2 == ExternalPath::kEigenTensor) {
1381 // Make additional allowance for Eigen differences.
1382 tolerated_max_diff *= 10.0f;
1383 }
1384 }
1385 tolerated_mean_diff = tolerated_max_diff / std::sqrt(size);
1386 } else if (std::is_same<Scalar, std::int32_t>::value) {
1387 // raw integer case, no rounding, so we can require exactness
1388 tolerated_max_diff = 0;
1389 tolerated_mean_diff = 0;
1390 } else {
1391 // quantized case, with rounding errors in downscaling int32 accumulators
1392 // to final 8bit or 16bit values.
1393
1394 if (external_path1 != ExternalPath::kNone ||
1395 external_path2 != ExternalPath::kNone) {
1396 // In this case, we are comparing against some other library than ruy.
1397 //
1398 // We may have to tolerate an error of +/- 1 from using different
1399 // rounding in fixed-point multiplication, and then again an error of +/-
1400 // 1 from using different rounding in right shifts, so the tolerance on
1401 // the difference may have to be as large as 2.
1402 tolerated_max_diff = 2;
1403 } else if (RUY_PLATFORM_ARM) {
1404 // All our code paths on ARM (32 and 64-bit) are bit-exact
1405 // with the reference code (by design of the reference code).
1406 tolerated_max_diff = 0;
1407 } else if (RUY_PLATFORM_X86) {
1408 // Our reference and ARM paths have diverged from x86 paths in PR #227.
1409 // TODO: update the x86 path to adapt to that and reset that tolerance
1410 // to 0.
1411 tolerated_max_diff = 1;
1412 } else {
1413 // Other architectures, which we don't have dedicated code paths for
1414 // at the moment.
1415 // TODO: try resetting that tolerance to 0, since by
1416 // definition we're only using non-optimized code paths here.
1417 tolerated_max_diff = 1;
1418 }
1419
1420 // totally empirical
1421 tolerated_mean_diff = std::min(1.0, 2.0 * std::pow(size, -0.18));
1422 }
1423 double sum_diff = 0;
1424 for (int row = 0; row < matrix1.layout().rows(); row++) {
1425 for (int col = 0; col < matrix1.layout().cols(); col++) {
1426 double elem1 = Element(matrix1, row, col);
1427 double elem2 = Element(matrix2, row, col);
1428 double diff = elem1 - elem2;
1429
1430 sum_diff += diff;
1431 // Test (std::abs(diff) > tolerated_max_diff), but also true if diff is
1432 // NaN.
1433 if (!(std::abs(diff) <= tolerated_max_diff)) {
1434 return false;
1435 }
1436 }
1437 }
1438 double mean_diff = sum_diff / size;
1439 if (std::abs(mean_diff) > tolerated_mean_diff) {
1440 return false;
1441 }
1442 return true;
1443 }
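// Rough magnitude of the float tolerance used in Agree above (illustrative
// numbers): with Scalar = float (epsilon ~= 1.19e-7), max_abs_val = 10 and
// depth = 100, tolerated_max_diff ~= 10 * 1.19e-7 * 64 * sqrt(100) ~= 7.6e-4,
// and tolerated_mean_diff divides that by sqrt(rows * cols).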
1444
1445 template <typename Scalar>
1446 bool Agree(ExternalPath external_path1,
1447 const StorageMatrix<Scalar>& storage_matrix1,
1448 ExternalPath external_path2,
1449 const StorageMatrix<Scalar>& storage_matrix2, int depth) {
1450 VerifyConsistentFields(storage_matrix1);
1451 VerifyConsistentFields(storage_matrix2);
1452 return Agree(external_path1, storage_matrix1.matrix, external_path2,
1453 storage_matrix2.matrix, depth);
1454 }
1455
1456 template <typename Scalar>
1457 bool Agree(const TestResult<Scalar>& result1, const TestResult<Scalar>& result2,
1458 int depth) {
1459 return Agree(result1.external_path, result1.storage_matrix,
1460 result2.external_path, result2.storage_matrix, depth);
1461 }
1462
1463 template <typename Scalar>
1464 void AddTestResultToCluster(
1465 TestResult<Scalar>** result,
1466 std::vector<std::vector<TestResult<Scalar>*>>& clusters, int depth) {
1467 bool inserted = false;
1468 for (auto& cluster : clusters) {
1469 bool agreement = true;
1470 // Test for agreement with every result in the cluster.
1471 for (auto& other_result : cluster) {
1472 agreement &= Agree(**result, *other_result, depth);
1473 }
1474 if (agreement) {
1475 cluster.push_back(*result);
1476 inserted = true;
1477 }
1478 }
1479 if (!inserted) {
1480 std::vector<TestResult<Scalar>*> new_results;
1481 new_results.push_back(*result);
1482 clusters.push_back(new_results);
1483 }
1484 }
1485
1486 template <typename Scalar>
1487 void PrintPathsInAgreement(
1488 const std::vector<std::unique_ptr<TestResult<Scalar>>>& results,
1489 int depth) {
1490 // A container holding vectors of TestResults, where membership indicates
1491 // that all TestResults agree with each other.
1492 std::vector<std::vector<TestResult<Scalar>*>> clusters;
1493 for (const auto& result : results) {
1494 TestResult<Scalar>* test_result = result.get();
1495 AddTestResultToCluster(&test_result, clusters, depth);
1496 }
1497
1498 std::cerr << "Error: Not all paths agree. \n";
1499 for (auto& cluster : clusters) {
1500 std::cerr << "These paths all agree with each other: ";
1501 for (auto& result : cluster) {
1502 std::cerr << PathName(*result) << ", ";
1503 }
1504 std::cerr << "but disagree with the rest.\n";
1505 }
1506 }
1507
1508 struct Stats {
1509 double median;
1510 double mean;
1511 double min;
1512 double max;
1513 };
1514
1515 inline std::string StatsAsString(const Stats& stats) {
1516 char buf[256];
1517 snprintf(buf, sizeof(buf), "(median = %g, mean = %g, min = %g, max = %g)",
1518 stats.median, stats.mean, stats.min, stats.max);
1519 return std::string(buf);
1520 }
1521
1522 template <typename Scalar>
1523 void GetMatrixStats(const Matrix<Scalar>& matrix, Stats* stats) {
1524 double min = std::numeric_limits<double>::infinity();
1525 double max = -std::numeric_limits<double>::infinity();
1526 double sum = 0;
1527 std::vector<double> allvals;
1528 for (int row = 0; row < matrix.layout().rows(); row++) {
1529 for (int col = 0; col < matrix.layout().cols(); col++) {
1530 double val = Element(matrix, row, col);
1531 min = std::min(min, val);
1532 max = std::max(max, val);
1533 sum += val;
1534 allvals.push_back(val);
1535 }
1536 }
1537 std::sort(allvals.begin(), allvals.end());
1538 stats->min = min;
1539 stats->max = max;
1540 stats->mean = sum / allvals.size();
1541 stats->median = allvals[allvals.size() / 2];
1542 }
1543
1544 struct ErrorAnalysis {
1545 Stats stats_good;
1546 Stats stats_bad;
// The fields below help document departures from bit exactness. They are
// probably not relevant to floating-point.
1549 std::set<int> error_rows;
1550 std::set<int> error_cols;
1551 int row_of_first_error = 0;
1552 int col_of_first_error = 0;
1553 double first_error_good_value = 0;
1554 double first_error_bad_value = 0;
1555 };
1556
1557 template <typename TestSetType>
1558 void AnalyzeTestError(const TestSetType& test_set, int first_bad_result_index,
1559 ErrorAnalysis* error_analysis) {
1560 const auto& good_matrix = test_set.results[0]->storage_matrix.matrix;
1561 const auto& bad_matrix =
1562 test_set.results[first_bad_result_index]->storage_matrix.matrix;
1563 GetMatrixStats(good_matrix, &error_analysis->stats_good);
1564 GetMatrixStats(bad_matrix, &error_analysis->stats_bad);
1565 bool found_first_error = false;
1566 for (int row = 0; row < good_matrix.layout().rows(); row++) {
1567 for (int col = 0; col < good_matrix.layout().cols(); col++) {
1568 if (Element(good_matrix, row, col) != Element(bad_matrix, row, col)) {
1569 if (!found_first_error) {
1570 found_first_error = true;
1571 error_analysis->row_of_first_error = row;
1572 error_analysis->col_of_first_error = col;
1573 error_analysis->first_error_good_value =
1574 Element(good_matrix, row, col);
1575 error_analysis->first_error_bad_value = Element(bad_matrix, row, col);
1576 }
1577 error_analysis->error_rows.insert(row);
1578 error_analysis->error_cols.insert(col);
1579 }
1580 }
1581 }
1582 }
1583
1584 template <typename TestSetType>
1585 void ComputeReasonableMultiplier(
1586 const Matrix<typename TestSetType::LhsScalar>& lhs,
1587 const Matrix<typename TestSetType::RhsScalar>&, double* multiplier) {
1588 using LhsScalar = typename TestSetType::LhsScalar;
1589 using RhsScalar = typename TestSetType::RhsScalar;
1590 using DstScalar = typename TestSetType::DstScalar;
1591 if (std::is_floating_point<DstScalar>::value ||
1592 std::is_same<DstScalar, std::int32_t>::value) {
1593 *multiplier = 0;
1594 return;
1595 }
1596 *multiplier = static_cast<double>(std::numeric_limits<DstScalar>::max()) /
1597 (static_cast<double>(lhs.layout().cols()) *
1598 std::numeric_limits<LhsScalar>::max() *
1599 std::numeric_limits<RhsScalar>::max());
1600 }
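
// Worked example for ComputeReasonableMultiplier above (illustrative numbers):
// with LhsScalar = RhsScalar = std::int8_t (max 127), DstScalar = std::int16_t
// (max 32767) and lhs.layout().cols() = 100, the multiplier is
//   32767 / (100 * 127 * 127) ~= 0.0203,
// i.e. an accumulator of magnitude 100 * 127 * 127 is scaled to roughly the
// std::int16_t max.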
1601
1602 inline void QuantizeMultiplier(double multiplier_double,
1603 std::int32_t* multiplier_fixedpoint,
1604 int* multiplier_exponent) {
  // A zero multiplier is allowed and maps to the (0, 0) fixed-point encoding;
  // otherwise the multiplier must be strictly positive.
  RUY_CHECK_GE(multiplier_double, 0);
  if (multiplier_double == 0.) {
    *multiplier_fixedpoint = 0;
    *multiplier_exponent = 0;
    return;
  }
1611 const double q = std::frexp(multiplier_double, multiplier_exponent);
1612 auto q_fixed = static_cast<std::int64_t>(std::round(q * (1ll << 31)));
1613 RUY_CHECK_LE(q_fixed, (1ll << 31));
1614 if (q_fixed == (1ll << 31)) {
1615 q_fixed /= 2;
1616 ++*multiplier_exponent;
1617 }
1618 RUY_CHECK_LE(q_fixed, std::numeric_limits<std::int32_t>::max());
1619 *multiplier_fixedpoint = static_cast<std::int32_t>(q_fixed);
1620 }
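
// Worked example for QuantizeMultiplier above (illustrative numbers; fp and
// exp stand for the output parameters):
//   QuantizeMultiplier(0.75, &fp, &exp): std::frexp returns 0.75 with
//     exponent 0, so fp == round(0.75 * 2^31) == 1610612736 and exp == 0.
//   QuantizeMultiplier(1.0, &fp, &exp): std::frexp returns 0.5 with
//     exponent 1, so fp == 2^30 == 1073741824 and exp == 1.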
1621
1622 template <typename TestSetType>
1623 void SwitchMultiplierToPerChannel(TestSetType* test_set) {
1624 ChannelDimension channel_dimension =
1625 (test_set->benchmark || global_random_engine()() % 2)
1626 ? ChannelDimension::kRow
1627 : ChannelDimension::kCol;
1628 test_set->mul_params.set_channel_dimension(channel_dimension);
1629 const int num_channels = channel_dimension == ChannelDimension::kRow
1630 ? test_set->rows
1631 : test_set->cols;
1632 test_set->per_channel_multiplier_fixedpoint.resize(num_channels);
1633 test_set->per_channel_multiplier_exponent.resize(num_channels);
1634 for (int i = 0; i < num_channels; i++) {
    // Multipliers typically range in [2^30, 2^31 - 1].
    // Values in [0, 2^30 - 1] are normally unused, but harmless.
    // Thus, a good way to randomize multipliers is to subtract from them a
    // random offset that is small compared to 2^30 yet still significant;
    // here the offset is drawn uniformly from [0, 2^26).
1639 std::int32_t nudged_multiplier =
1640 test_set->mul_params.multiplier_fixedpoint() -
1641 (global_random_engine()() % (1 << 26));
1642 int nudged_exponent = test_set->mul_params.multiplier_exponent() - 1 +
1643 (global_random_engine()() % 4);
1644 test_set->per_channel_multiplier_fixedpoint[i] = nudged_multiplier;
1645 test_set->per_channel_multiplier_exponent[i] = nudged_exponent;
1646 }
1647 test_set->mul_params.set_multiplier_fixedpoint(0);
1648 test_set->mul_params.set_multiplier_exponent(0);
1649 test_set->mul_params.set_multiplier_fixedpoint_perchannel(
1650 test_set->per_channel_multiplier_fixedpoint.data());
1651 test_set->mul_params.set_multiplier_exponent_perchannel(
1652 test_set->per_channel_multiplier_exponent.data());
1653 }
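
// Illustrative example for SwitchMultiplierToPerChannel above: if the shared
// multiplier was (fixedpoint = 2^30, exponent = -3), then each channel gets a
// fixedpoint nudged down by a random amount below 2^26, i.e. a value in
// (2^30 - 2^26, 2^30], and an exponent drawn uniformly from {-4, -3, -2, -1}.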
1654
1655 template <
1656 typename TestSetType,
1657 bool IsApplicable =
1658 std::is_same<typename TestSetType::AccumScalar, std::int32_t>::value &&
1659 !std::is_same<typename TestSetType::DstScalar, std::int32_t>::value>
1660 struct MakeSpecMultiplierFieldsImpl {};
1661
1662 template <typename TestSetType>
1663 struct MakeSpecMultiplierFieldsImpl<TestSetType, true> {
1664 static void Run(TestSetType* test_set) {
1665 double multiplier;
1666 ComputeReasonableMultiplier<TestSetType>(test_set->lhs.matrix,
1667 test_set->rhs.matrix, &multiplier);
1668 typename TestSetType::AccumScalar multiplier_fixedpoint;
1669 int multiplier_exponent;
1670 QuantizeMultiplier(multiplier, &multiplier_fixedpoint,
1671 &multiplier_exponent);
1672 test_set->mul_params.set_multiplier_fixedpoint(multiplier_fixedpoint);
1673 test_set->mul_params.set_multiplier_exponent(multiplier_exponent);
1674
1675 if (!test_set->benchmark) {
1676 test_set->perchannel = global_random_engine()() & 1;
1677 }
1678 if (test_set->perchannel) {
1679 SwitchMultiplierToPerChannel(test_set);
1680 }
1681 }
1682 };
1683
1684 template <typename TestSetType>
1685 struct MakeSpecMultiplierFieldsImpl<TestSetType, false> {
1686 static void Run(TestSetType*) {}
1687 };
1688
1689 template <typename MulParamsType>
1690 void MakeSpecClampFields(MulParamsType* mul_params) {
1691 using DstScalar = typename MulParamsType::DstScalar;
1692
1693 if (getenv("BENCHMARK_ONLY_MATMUL")) {
1694 if (std::is_floating_point<DstScalar>::value) {
1695 mul_params->set_clamp_min(-std::numeric_limits<DstScalar>::infinity());
1696 mul_params->set_clamp_max(std::numeric_limits<DstScalar>::infinity());
1697 } else {
1698 mul_params->set_clamp_min(std::numeric_limits<DstScalar>::lowest());
1699 mul_params->set_clamp_max(std::numeric_limits<DstScalar>::max());
1700 }
1701 return;
1702 }
1703
1704 mul_params->set_clamp_min(std::numeric_limits<DstScalar>::lowest() + 1);
1705 mul_params->set_clamp_max(std::numeric_limits<DstScalar>::max() - 1);
1706 }
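
// For example (illustrative), with DstScalar = std::uint8_t the non-benchmark
// clamp bounds set above are [1, 254] rather than the full representable
// range [0, 255], presumably so that clamping actually affects some output
// entries and is thereby exercised by the tests.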
1707
inline void MakeSpecClampFields(MulParams<std::int32_t, std::int32_t>*) {
  // When returning raw std::int32_t accumulators, clamping is not supported,
  // so there are no clamp fields to set.
}
1711
1712 template <typename LhsScalar, typename RhsScalar, typename AccumScalar,
1713 typename DstScalar>
1714 void TestSet<LhsScalar, RhsScalar, AccumScalar, DstScalar>::MakeZeroPoints() {
1715 RUY_CHECK_EQ(life_stage, LifeStage::kInitial);
1716 if (std::is_same<LhsScalar, std::int16_t>::value ||
1717 std::is_same<RhsScalar, std::int16_t>::value) {
    // For now, support for int16 source types is limited to the symmetric
    // case (zero_point == 0): that matches the initial use cases and keeps
    // reasoning about accumulator overflow simple.
    // Setting use_specified_zero_points causes the default zero points (0) to
    // be used unless they are explicitly overridden.
1724 use_specified_zero_points = true;
1725 }
1726 if (!benchmark && !use_specified_zero_points) {
1727 MakeRandomScalar(RandomRange::kReasonableSrcZeroPoint, &lhs_zero_point);
1728 MakeRandomScalar(RandomRange::kReasonableSrcZeroPoint, &rhs_zero_point);
1729 // If destination is std::int32_t, no dst_zero_point is necessary.
1730 if (std::is_same<DstScalar, std::int32_t>::value) {
1731 dst_zero_point = 0;
1732 } else {
1733 MakeRandomScalar(RandomRange::kReasonableDstZeroPoint, &dst_zero_point);
1734 }
1735 }
1736 life_stage = LifeStage::kHasZeroPoints;
1737 }
1738
1739 template <typename LhsScalar, typename RhsScalar, typename AccumScalar,
1740 typename DstScalar>
1741 void TestSet<LhsScalar, RhsScalar, AccumScalar, DstScalar>::MakeLhsRhs() {
1742 RUY_CHECK_EQ(life_stage, LifeStage::kHasZeroPoints);
1743 MakeRandom(rows, depth, lhs_order, lhs_zero_point, layout_style,
1744 RandomRange::kOffCenterAvoidMinValue, &lhs);
1745 MakeRandom(depth, cols, rhs_order, rhs_zero_point, layout_style,
1746 RandomRange::kGeneral, &rhs);
1747 if (!benchmark) {
1748 cache_lhs = (global_random_engine()() & 0xf) == 0;
1749 cache_rhs = (global_random_engine()() & 0xf) == 0;
1750 }
1751 if (cache_lhs) {
1752 lhs.matrix.set_cache_policy(CachePolicy::kAlwaysCache);
1753 }
1754 if (cache_rhs) {
1755 rhs.matrix.set_cache_policy(CachePolicy::kAlwaysCache);
1756 }
1757 life_stage = LifeStage::kHasLhsRhs;
1758 }
1759
1760 template <typename LhsScalar, typename RhsScalar, typename AccumScalar,
1761 typename DstScalar>
1762 void TestSet<LhsScalar, RhsScalar, AccumScalar, DstScalar>::MakeMulParams() {
1763 RUY_CHECK_EQ(life_stage, LifeStage::kHasLhsRhs);
1764
1765 if (!getenv("BENCHMARK_ONLY_MATMUL") &&
1766 (benchmark || (global_random_engine()() & 1))) {
1767 MakeRandomVector(RandomRange::kBias, std::max(rows, cols), &bias_data);
1768 mul_params.set_bias(bias_data.data());
1769 }
  // Avoid the corner case where both the LHS and RHS zero points sit at the
  // lowest representable value, which the random generation above can
  // produce.
  if (lhs.matrix.zero_point() == std::numeric_limits<LhsScalar>::lowest() &&
      rhs.matrix.zero_point() == std::numeric_limits<RhsScalar>::lowest()) {
    lhs.matrix.set_zero_point(lhs.matrix.zero_point() + 1);
  }
1774 MakeSpecMultiplierFieldsImpl<TestSet>::Run(this);
1775 MakeSpecClampFields(&mul_params);
1776 life_stage = LifeStage::kHasMulParams;
1777 }
1778
1779 inline int GetIntEnvVarOrZero(const char* name) {
1780 const char* val = getenv(name);
1781 if (!val) {
1782 return 0;
1783 }
1784 return std::stoi(val);
1785 }
1786
1787 inline float GetFloatEnvVarOrZero(const char* name) {
1788 const char* val = getenv(name);
1789 if (!val) {
1790 return 0;
1791 }
1792 return std::stof(val);
1793 }
1794
1795 inline int GetHexIntEnvVarOrZero(const char* name) {
1796 const char* val = getenv(name);
1797 if (!val) {
1798 return 0;
1799 }
1800 return std::stoi(val, nullptr, 16);
1801 }
1802
1803 inline bool GetBoolEnvVarOrFalse(const char* name) {
1804 return static_cast<bool>(GetIntEnvVarOrZero(name));
1805 }
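
// Illustrative usage of the environment variables read through the helpers
// above (the test binary name and values are made up):
//   PATHS=30 THREADS=4 NOEXT=1 ./some_ruy_test
// selects the ruy paths whose bits lie in 0x30, sets the maximum number of
// threads to 4, and skips the external (non-ruy) comparison paths.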
1806
1807 template <typename LhsScalar, typename RhsScalar, typename AccumScalar,
1808 typename DstScalar>
1809 void TestSet<LhsScalar, RhsScalar, AccumScalar, DstScalar>::MakeOtherParams() {
1810 RUY_CHECK_EQ(life_stage, LifeStage::kHasMulParams);
1811 if (max_num_threads == 0) {
1812 max_num_threads = GetIntEnvVarOrZero("THREADS");
1813 }
1814 life_stage = LifeStage::kHasOtherParams;
1815 }
1816
1817 inline std::vector<Path> PathsBitfieldAsVector(Path paths_bitfield) {
1818 std::vector<Path> result;
1819 std::uint32_t remaining_paths = static_cast<std::uint32_t>(paths_bitfield);
1820 std::uint32_t test_bit = 1;
1821 while (remaining_paths) {
1822 if (remaining_paths & test_bit) {
1823 result.push_back(static_cast<Path>(test_bit));
1824 }
1825 remaining_paths &= ~test_bit;
1826 test_bit <<= 1;
1827 }
1828 return result;
1829 }
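
// Illustrative example for PathsBitfieldAsVector above: a bitfield value of
// 0x5 (bits 0 and 2 set) is decomposed into the single-bit Path values 0x1
// and 0x4, returned in increasing bit order.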
1830
1831 inline std::vector<Tuning> EnumerateTuningsForPath(Path path, bool benchmark) {
1832 if (benchmark) {
1833 return {Tuning::kAuto};
1834 }
1835 #if RUY_PLATFORM_ARM
1836 if (path == Path::kNeon || path == Path::kNeonDotprod) {
1837 return {Tuning::kA55ish, Tuning::kX1, Tuning::kGeneric, Tuning::kAuto};
1838 }
1839 #endif
1840 (void)path;
1841 return {Tuning::kAuto};
1842 }
1843
1844 template <typename LhsScalar, typename RhsScalar, typename AccumScalar,
1845 typename DstScalar>
1846 void TestSet<LhsScalar, RhsScalar, AccumScalar, DstScalar>::MakeResultPaths() {
1847 RUY_CHECK_EQ(life_stage, LifeStage::kHasOtherParams);
1848
1849 Path paths_bitfield = static_cast<Path>(GetHexIntEnvVarOrZero("PATHS"));
1850
1851 if (paths_bitfield == Path::kNone) {
    // Use a dummy Context just to resolve which paths are enabled at runtime.
1854 Context context;
1855 paths_bitfield = get_ctx(&context)->GetRuntimeEnabledPaths();
1856 }
1857
  // Disable the internal test-only variants of the StandardCpp path in
  // benchmarks.
1860 if (benchmark) {
1861 paths_bitfield = paths_bitfield & kAllPaths;
1862 }
1863
  // Disable the internal test-only variants of the StandardCpp path on large
  // tests. This constant should be large enough to exercise some interesting
  // BlockMap logic, yet small enough to avoid large test-latency increases
  // from running these slow code paths on large matrix multiplications.
1869 const int kMaxSizeToTestInternalStandardCppVariants = 300;
1870 if (rows > kMaxSizeToTestInternalStandardCppVariants ||
1871 cols > kMaxSizeToTestInternalStandardCppVariants ||
1872 depth > kMaxSizeToTestInternalStandardCppVariants) {
1873 paths_bitfield = paths_bitfield & kAllPaths;
1874 }
1875
1876 // Trim bits that don't correspond to a compiled path,
1877 // to allow specifying e.g. ffff to mean 'all paths' regardless of whether all
1878 // those bits exist as actual paths.
1879 paths_bitfield = paths_bitfield & kAllPathsIncludingInternalVariants;
1880 RUY_CHECK_NE(paths_bitfield, Path::kNone);
1881 paths = PathsBitfieldAsVector(paths_bitfield);
1882
1883 // kReference is a special 'external path' that's always available.
1884 // It can still be disabled by NOEXT.
1885 if (!GetBoolEnvVarOrFalse("NOEXT")) {
1886 external_paths.push_back(ExternalPath::kReference);
1887 }
1888
1889 #ifdef RUY_TEST_EXTERNAL_PATHS
1890
1891 if (!GetBoolEnvVarOrFalse("NOEXT")) {
1892 if (SupportsGemmlowp<TestSet>::kValue) {
1893 #ifdef GEMMLOWP_SSE4
1894 const bool gemmlowp_supported =
1895 !mul_params.multiplier_fixedpoint_perchannel() &&
1896 mul_params.channel_dimension() == ChannelDimension::kRow;
1897 #else
1898 const bool gemmlowp_supported =
1899 mul_params.channel_dimension() == ChannelDimension::kRow;
1900 #endif
1901 if (gemmlowp_supported) {
1902 external_paths.push_back(ExternalPath::kGemmlowp);
1903 }
1904 }
1905 if (UsesSingleScalarType<TestSet>::kValue &&
1906 std::is_floating_point<AccumScalar>::value) {
1907 external_paths.push_back(ExternalPath::kEigen);
1908 if (layout_style == LayoutStyle::kUnstridedLinear) {
1909 external_paths.push_back(ExternalPath::kEigenTensor);
1910 }
1911 // We link against a generic BLAS target that only maps to OpenBLAS on specific
1912 // architectures.
1913 #if RUY_PLATFORM_ARM_32 || RUY_PLATFORM_ARM_64
      // Our OpenBLAS setup has multi-threading disabled, so only benchmark
      // against it in single-threaded runs, to avoid mixing single-threaded
      // and multi-threaded benchmark results.
1916 if (max_num_threads == 1 && !getenv("NO_OPENBLAS")) {
1917 external_paths.push_back(ExternalPath::kOpenBlas);
1918 }
1919 #endif
1920 }
1921 }
1922
1923 #endif // RUY_TEST_EXTERNAL_PATHS
1924
1925 for (Path path : paths) {
1926 for (Tuning tuning : EnumerateTuningsForPath(path, benchmark)) {
1927 results.emplace_back(new TestResultType);
1928 TestResultType& result = *results.back();
1929 result.path = path;
1930 result.tuning = tuning;
1931 MakeRandom(rows, cols, dst_order, dst_zero_point, layout_style,
1932 RandomRange::kGeneral, &result.storage_matrix);
1933 }
1934 }
1935
1936 for (ExternalPath external_path : external_paths) {
1937 results.emplace_back(new TestResultType);
1938 TestResultType& result = *results.back();
1939 result.external_path = external_path;
1940 MakeRandom(rows, cols, dst_order, dst_zero_point, layout_style,
1941 RandomRange::kGeneral, &result.storage_matrix);
1942 }
1943
1944 life_stage = LifeStage::kHasResultPaths;
1945 }
1946
1947 template <typename LhsScalar, typename RhsScalar, typename AccumScalar,
1948 typename DstScalar>
1949 void TestSet<LhsScalar, RhsScalar, AccumScalar, DstScalar>::EvalResult(
1950 TestResult<DstScalar>* result) {
1951 RUY_CHECK(result->path != Path::kNone ||
1952 result->external_path != ExternalPath::kNone);
1953 if (result->path != Path::kNone) {
1954 EvalRuy(result);
1955 } else {
1956 EvalExternalPath(this, result);
1957 }
1958 const std::string& pathname = PathName(*result);
1959 if (std::find(CoveredPaths()->begin(), CoveredPaths()->end(), pathname) ==
1960 CoveredPaths()->end()) {
1961 CoveredPaths()->push_back(pathname);
1962 }
1963 }
1964
1965 using f32 = float;
1966 using f64 = double;
1967 using u8 = std::uint8_t;
1968 using i8 = std::int8_t;
1969 using u16 = std::uint16_t;
1970 using i16 = std::int16_t;
1971 using u32 = std::uint32_t;
1972 using i32 = std::int32_t;
1973 using u64 = std::uint64_t;
1974 using i64 = std::int64_t;
1975
1976 template <typename Scalar>
1977 const char* TypeName() {
1978 return nullptr;
1979 }
1980
1981 #define RUY_TYPENAME(TYPE) \
1982 template <> \
1983 inline const char* TypeName<TYPE>() { \
1984 return #TYPE; \
1985 }
1986
1987 RUY_TYPENAME(f32)
1988 RUY_TYPENAME(f64)
1989 RUY_TYPENAME(u8)
1990 RUY_TYPENAME(i8)
1991 RUY_TYPENAME(u16)
1992 RUY_TYPENAME(i16)
1993 RUY_TYPENAME(u32)
1994 RUY_TYPENAME(i32)
1995 RUY_TYPENAME(u64)
1996 RUY_TYPENAME(i64)
1997
1998 #undef RUY_TYPENAME
1999
2000 template <typename Scalar>
2001 const char* SymmetryName(const Matrix<Scalar>& matrix) {
2002 if (matrix.zero_point() == SymmetricZeroPoint<Scalar>()) {
2003 return "symm";
2004 } else {
2005 return "asymm";
2006 }
2007 }
2008
2009 template <typename Scalar>
2010 int StorageSize(const Matrix<Scalar>& matrix) {
2011 return sizeof(Scalar) * FlatSize(matrix.layout());
2012 }
2013
// Helper that replicates a buffer and hands out pointers to the replicas in
// round-robin order. This is useful when one wants to traverse data that is
// cold in cache: with a sufficiently large num_repeats, the working set
// covered by the replicas exceeds the cache size, so each replica is likely
// to be cold by the time it is handed out again.
2018 template <typename T>
2019 class RepeatedBuffer {
2020 public:
2021 RepeatedBuffer() = default;
2022 void Init(const T* elems, int num_elems, int num_repeats) {
2023 buffers_.clear();
2024 allocator_.FreeAll();
2025 for (int i = 0; i < num_repeats; i++) {
2026 T* p;
2027 allocator_.Allocate(num_elems, &p);
2028 memcpy(p, elems, num_elems * sizeof(T));
2029 buffers_.push_back(p);
2030 }
2031 }
2032 T* Next() {
2033 T* ret = buffers_[current_];
2034 current_ = (current_ + 1) % buffers_.size();
2035 return ret;
2036 }
2037
2038 private:
2039 Allocator allocator_;
2040 std::vector<T*> buffers_;
2041 int current_ = 0;
2042 };
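
// Minimal usage sketch for RepeatedBuffer (illustrative, local names made up):
//   std::vector<float> src(1024, 1.0f);
//   RepeatedBuffer<float> repeated;
//   repeated.Init(src.data(), static_cast<int>(src.size()),
//                 /*num_repeats=*/64);
//   float* replica = repeated.Next();  // cycles through the 64 replicas
// With a large enough num_repeats, each replica returned by Next() is likely
// to be cold in cache by the time it is reused.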
2043
2044 template <typename LhsScalar, typename RhsScalar, typename AccumScalar,
2045 typename DstScalar>
2046 void TestSet<LhsScalar, RhsScalar, AccumScalar, DstScalar>::Benchmark(
2047 TestResult<DstScalar>* result) {
  const bool cold = getenv("RUY_BENCHMARK_COLD") != nullptr;
2049 LhsScalar* orig_lhs_data = lhs.matrix.data();
2050 RhsScalar* orig_rhs_data = rhs.matrix.data();
2051 DstScalar* orig_dst_data = result->storage_matrix.matrix.data();
2052
2053 int num_matmul_sets = 0;
2054
2055 RepeatedBuffer<LhsScalar> cold_lhs;
2056 RepeatedBuffer<RhsScalar> cold_rhs;
2057 RepeatedBuffer<DstScalar> cold_dst;
2058
2059 if (cold) {
2060 const int kWorkingSetSize = 100 << 20;
2061 const int each_matmul_set_size = StorageSize(lhs.matrix) +
2062 StorageSize(rhs.matrix) +
2063 StorageSize(result->storage_matrix.matrix);
2064 num_matmul_sets =
2065 (kWorkingSetSize + each_matmul_set_size - 1) / each_matmul_set_size;
2066
2067 cold_lhs.Init(lhs.matrix.data(), FlatSize(lhs.matrix.layout()),
2068 num_matmul_sets);
2069 cold_rhs.Init(rhs.matrix.data(), FlatSize(rhs.matrix.layout()),
2070 num_matmul_sets);
2071 cold_dst.Init(result->storage_matrix.matrix.data(),
2072 FlatSize(result->storage_matrix.matrix.layout()),
2073 num_matmul_sets);
2074 }
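  // Worked example (illustrative): kWorkingSetSize is 100 MiB. For a
  // 100x100x100 float32 multiplication, each matmul set occupies roughly
  // 3 * 100 * 100 * 4 = 120000 bytes, so num_matmul_sets evaluates to
  // ceil(104857600 / 120000) = 874 replicas of each operand.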
2075 const bool record_pmu = GetBoolEnvVarOrFalse("RUY_BENCHMARK_PMU");
2076 int repeats = GetIntEnvVarOrZero("RUY_BENCHMARK_REPEATS");
2077 if (!repeats) {
2078 repeats = 4;
2079 }
2080 float benchmark_min_secs = GetFloatEnvVarOrZero("RUY_BENCHMARK_MIN_SECS");
2081 if (!benchmark_min_secs) {
2082 benchmark_min_secs = 0.5;
2083 }
2084 #ifdef RUY_PROFILER
2085 {
2086 const char* lhstype = TypeName<LhsScalar>();
2087 const char* lhssymm = SymmetryName(lhs.matrix);
2088 const char* rhstype = TypeName<RhsScalar>();
2089 const char* rhssymm = SymmetryName(rhs.matrix);
2090
2091 printf("Profiling path=%s shape=(%dx%dx%d) lhs=(%s,%s) rhs=(%s,%s)\n",
2092 PathName(*result).c_str(), rows, depth, cols, lhstype, lhssymm,
2093 rhstype, rhssymm);
2094 ruy::profiler::ScopeProfile profile;
2095 #endif
2096
2097 float latency = std::numeric_limits<float>::infinity();
2098 float l1_refill_rate = std::numeric_limits<float>::infinity();
2099 float l2_refill_rate = std::numeric_limits<float>::infinity();
2100 float l3_refill_rate = std::numeric_limits<float>::infinity();
2101 float l1tlb_refill_rate = std::numeric_limits<float>::infinity();
2102 float l2tlb_refill_rate = std::numeric_limits<float>::infinity();
2103 float mispred_rate = std::numeric_limits<float>::infinity();
2104 float frontend_stall_rate = std::numeric_limits<float>::infinity();
2105 float backend_stall_rate = std::numeric_limits<float>::infinity();
2106
2107 for (int repeat = 0; repeat < repeats; repeat++) {
2108 auto& pmu_events = GlobalPmuEvents();
2109 if (record_pmu) {
2110 pmu_events.StartRecording();
2111 }
2112 TimePoint time_start = Now();
2113 TimePoint t = time_start;
2114 int iters = 0;
2115 int iters_at_a_time = 1;
2116 while (ToFloatSeconds(t - time_start) < benchmark_min_secs) {
2117 for (int i = 0; i < iters_at_a_time; i++) {
2118 if (cold) {
2119 lhs.matrix.set_data(cold_lhs.Next());
2120 rhs.matrix.set_data(cold_rhs.Next());
2121 result->storage_matrix.matrix.set_data(cold_dst.Next());
2122 }
2123 EvalResult(result);
2124 iters++;
2125 }
2126 iters_at_a_time *= 2;
2127 t = Now();
2128 }
2129 latency = std::min(
2130 latency, static_cast<float>(ToFloatSeconds(t - time_start) / iters));
2131 if (record_pmu) {
2132 pmu_events.StopRecording();
2133 const float normalization_factor =
2134 1.0f / (static_cast<float>(iters) * rows * cols * depth);
2135 l1_refill_rate = std::min(
2136 l1_refill_rate, pmu_events.L1RefillCount() * normalization_factor);
2137 l2_refill_rate = std::min(
2138 l2_refill_rate, pmu_events.L2RefillCount() * normalization_factor);
2139 l3_refill_rate = std::min(
2140 l3_refill_rate, pmu_events.L3RefillCount() * normalization_factor);
2141 l1tlb_refill_rate =
2142 std::min(l1tlb_refill_rate,
2143 pmu_events.L1TLBRefillCount() * normalization_factor);
2144 l2tlb_refill_rate =
2145 std::min(l2tlb_refill_rate,
2146 pmu_events.L2TLBRefillCount() * normalization_factor);
2147 mispred_rate =
2148 std::min(mispred_rate, pmu_events.BranchMispredictionCount() *
2149 normalization_factor);
2150 frontend_stall_rate =
2151 std::min(frontend_stall_rate,
2152 pmu_events.FrontendStallCount() * normalization_factor);
2153 backend_stall_rate =
2154 std::min(backend_stall_rate,
2155 pmu_events.BackendStallCount() * normalization_factor);
2156 }
2157 }
2158 result->latency = latency;
2159 if (record_pmu) {
2160 result->l1_refill_rate = l1_refill_rate;
2161 result->l2_refill_rate = l2_refill_rate;
2162 result->l3_refill_rate = l3_refill_rate;
2163 result->l1tlb_refill_rate = l1tlb_refill_rate;
2164 result->l2tlb_refill_rate = l2tlb_refill_rate;
2165 result->mispred_rate = mispred_rate;
2166 result->frontend_stall_rate = frontend_stall_rate;
2167 result->backend_stall_rate = backend_stall_rate;
2168 }
2169
2170 #ifdef RUY_PROFILER
2171 }
2172 fflush(stdout);
2173 #endif
2174
2175 if (cold) {
2176 lhs.matrix.set_data(orig_lhs_data);
2177 rhs.matrix.set_data(orig_rhs_data);
2178 memcpy(orig_dst_data, result->storage_matrix.matrix.data(),
2179 StorageSize(result->storage_matrix.matrix));
2180 result->storage_matrix.matrix.set_data(orig_dst_data);
2181 }
2182 }
2183
2184 template <typename LhsScalar, typename RhsScalar, typename AccumScalar,
2185 typename DstScalar>
2186 void TestSet<LhsScalar, RhsScalar, AccumScalar, DstScalar>::Eval() {
2187 RUY_CHECK_EQ(life_stage, LifeStage::kHasResultPaths);
2188 for (auto& result : results) {
2189 if (benchmark) {
2190 Benchmark(result.get());
2191 } else {
2192 EvalResult(result.get());
2193 }
2194 }
2195 life_stage = LifeStage::kEvaluated;
2196 }
2197
2198 template <typename Scalar>
2199 std::string DumpRegion(const Matrix<Scalar>& matrix, int center_row,
2200 int center_col) {
2201 static constexpr int kRadius = 20;
2202 int first_row = std::max(0, center_row - kRadius);
2203 int last_row = std::min(matrix.layout().rows() - 1, center_row + kRadius);
2204 int first_col = std::max(0, center_col - kRadius);
2205 int last_col = std::min(matrix.layout().cols() - 1, center_col + kRadius);
2206 std::ostringstream stream;
2207 for (int row = first_row; row <= last_row; row++) {
2208 for (int col = first_col; col <= last_col; col++) {
2209 stream << static_cast<double>(Element(matrix, row, col)) << " ";
2210 }
2211 stream << "\n";
2212 }
2213 return stream.str();
2214 }
2215
2216 template <typename LhsScalar, typename RhsScalar, typename AccumScalar,
2217 typename DstScalar>
2218 void TestSet<LhsScalar, RhsScalar, AccumScalar, DstScalar>::VerifyTestResults()
2219 const {
2220 const int depth = lhs.matrix.layout().cols();
2221 for (int i = 0; i < static_cast<int>(results.size()) - 1; i++) {
2222 if (!Agree(*results[i], *results[i + 1], depth)) {
2223 PrintPathsInAgreement(results, depth);
2224 ErrorAnalysis error_analysis;
2225 AnalyzeTestError(*this, i + 1, &error_analysis);
2226 std::cerr << "Shape: rows = " << rows << ", cols = " << cols
2227 << ", depth = " << depth << std::endl;
2228 std::cerr << "Stats of the good result matrix: "
2229 << StatsAsString(error_analysis.stats_good) << std::endl;
2230 std::cerr << "Stats of the bad result matrix: "
2231 << StatsAsString(error_analysis.stats_bad) << std::endl;
2232 if (static_cast<int>(error_analysis.error_rows.size()) < rows) {
2233 std::cerr << "Rows containing errors: "
2234 << Join(error_analysis.error_rows) << std::endl;
2235 } else {
2236 std::cerr << "Errors found in ALL rows." << std::endl;
2237 }
2238 if (static_cast<int>(error_analysis.error_cols.size()) < cols) {
2239 std::cerr << "Cols containing errors: "
2240 << Join(error_analysis.error_cols) << std::endl;
2241 } else {
2242 std::cerr << "Errors found in ALL cols." << std::endl;
2243 }
2244 std::cerr << "The first error occurs at row "
2245 << error_analysis.row_of_first_error << ", col "
2246 << error_analysis.col_of_first_error << std::endl;
2247 std::cerr << "Good value: " << error_analysis.first_error_good_value
2248 << std::endl;
2249 std::cerr << "Bad value : " << error_analysis.first_error_bad_value
2250 << std::endl;
2251 std::cerr << "Region of Good result matrix around first error:\n\n"
2252 << DumpRegion(results[0]->storage_matrix.matrix,
2253 error_analysis.row_of_first_error,
2254 error_analysis.col_of_first_error)
2255 << std::endl;
2256 std::cerr << "Region of Bad result matrix around first error:\n\n"
2257 << DumpRegion(results[i + 1]->storage_matrix.matrix,
2258 error_analysis.row_of_first_error,
2259 error_analysis.col_of_first_error)
2260 << std::endl;
2261 RUY_CHECK(false);
2262 }
2263 }
2264 }
2265
2266 template <typename LhsScalar, typename RhsScalar, typename AccumScalar,
2267 typename DstScalar>
2268 void TestSet<LhsScalar, RhsScalar, AccumScalar, DstScalar>::Verify() {
2269 RUY_CHECK_EQ(life_stage, LifeStage::kEvaluated);
2270 VerifyTestResults();
2271 life_stage = LifeStage::kFinal;
2272 }
2273
2274 template <typename TestSetType>
2275 void TestRCC(int rows, int depth, int cols) {
2276 TestSetType test_set;
2277 test_set.rows = rows;
2278 test_set.depth = depth;
2279 test_set.cols = cols;
2280 test_set.lhs_order = Order::kRowMajor;
2281 test_set.rhs_order = Order::kColMajor;
2282 test_set.dst_order = Order::kColMajor;
2283 test_set.layout_style = LayoutStyle::kUnstridedLinear;
2284 test_set.Run();
2285 }
2286
2287 template <typename TestSetType>
2288 void TestNonRCC(int rows, int depth, int cols) {
2289 TestSetType test_set;
2290 test_set.rows = rows;
2291 test_set.depth = depth;
2292 test_set.cols = cols;
2293 test_set.lhs_order = Order::kColMajor;
2294 test_set.rhs_order = Order::kColMajor;
2295 test_set.dst_order = Order::kColMajor;
2296 test_set.layout_style = LayoutStyle::kUnstridedLinear;
2297 test_set.Run();
2298 }
2299
2300 template <typename TestSetType>
2301 void TestLinearAllOrders(int rows, int depth, int cols) {
2302 const std::vector<Order> orders{Order::kColMajor, Order::kRowMajor};
2303
2304 for (Order lhs_order : orders) {
2305 for (Order rhs_order : orders) {
2306 for (Order dst_order : orders) {
2307 TestSetType test_set;
2308 test_set.rows = rows;
2309 test_set.depth = depth;
2310 test_set.cols = cols;
2311 test_set.lhs_order = lhs_order;
2312 test_set.rhs_order = rhs_order;
2313 test_set.dst_order = dst_order;
2314 test_set.layout_style = LayoutStyle::kLinear;
2315 test_set.Run();
2316 }
2317 }
2318 }
2319 }
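
// Minimal usage sketch for the test entry points above (illustrative; the
// scalar types and shape are made up):
//   using FloatTestSet = TestSet<float, float, float, float>;
//   TestRCC<FloatTestSet>(12, 34, 56);              // row-major LHS, col-major RHS/dst
//   TestLinearAllOrders<FloatTestSet>(12, 34, 56);  // all storage-order combinations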
2320
2321 } // namespace ruy
2322
2323 #endif // RUY_RUY_TEST_H_
2324