// Example command line to build on Android ARM64:
/*
~/android/toolchains/r15c-aarch64/bin/aarch64-linux-android-clang++ \
test/benchmark_all_sizes.cc -o /tmp/b -O3 --std=c++11 -fPIE -static \
-DBENCHMARK_QUICK -DBENCHMARK_8bit
*/
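//
// One arithmetic-path macro (BENCHMARK_8bit, BENCHMARK_8bit_wide,
// BENCHMARK_8bit_to_32bit, BENCHMARK_8bit_to_32bit_wide or BENCHMARK_float)
// and one shape-selection macro (BENCHMARK_QUICK, BENCHMARK_EXHAUSTIVE, or
// BENCHMARK_ROWS/BENCHMARK_DEPTH/BENCHMARK_COLS) must be defined; see the
// #error directives below. BENCHMARK_NUM_THREADS optionally sets the number
// of worker threads (default: 1).
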
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <iostream>
#include <map>
#include <random>
#include <set>
#include <tuple>
#include <vector>

#include "../public/gemmlowp.h"

#ifdef GEMMLOWP_PROFILING
#include "../profiling/profiler.h"
#endif

#if defined GEMMLOWP_ANDROID && defined GEMMLOWP_ARM_32
// Compilation workaround: some Android ARM32 standard libraries fail to
// declare std::round, so import the C round() into namespace std.
namespace std {
using ::round;
}
#endif

// Minimum duration of each benchmark measurement. Also the duration of the
// sleep between two consecutive benchmark measurements, to prevent
// overheating.
const double kBenchmarkSecs = 0.1;

// Sleep time before each benchmark.
const int kCooldownBeforeBenchmarkSecs = 0;

// Number of benchmark passes; each shape is measured once per pass and the
// minimum latency across passes is reported.
const int kPasses = 4;

#ifdef BENCHMARK_NUM_THREADS
const int kNumThreads = BENCHMARK_NUM_THREADS;
#else
const int kNumThreads = 1;
#endif

namespace gemmlowp {

// gemmlowp itself doesn't have a Matrix class, only a MatrixMap class,
// since it only maps existing data. In tests though, we need to
// create our own matrices.
template <typename tScalar, MapOrder tOrder>
class Matrix : public MatrixMap<tScalar, tOrder> {
 public:
  typedef MatrixMap<tScalar, tOrder> Map;
  typedef MatrixMap<const tScalar, tOrder> ConstMap;
  typedef typename Map::Scalar Scalar;
  static const MapOrder Order = tOrder;
  using Map::cols_;
  using Map::data_;
  using Map::kOrder;
  using Map::rows_;
  using Map::stride_;

 public:
  Matrix() : Map(nullptr, 0, 0, 0) {}

  Matrix(int rows, int cols) : Map(nullptr, 0, 0, 0) { Resize(rows, cols); }

  Matrix(const Matrix& other) : Map(nullptr, 0, 0, 0) { *this = other; }

  Matrix& operator=(const Matrix& other) {
    Resize(other.rows_, other.cols_);
    std::memcpy(data_, other.data_, size() * sizeof(Scalar));
    return *this;
  }

  friend bool operator==(const Matrix& a, const Matrix& b) {
    return a.rows_ == b.rows_ && a.cols_ == b.cols_ &&
           !std::memcmp(a.data_, b.data_, a.size() * sizeof(Scalar));
  }

  void Resize(int rows, int cols) {
    rows_ = rows;
    cols_ = cols;
    stride_ = kOrder == MapOrder::ColMajor ? rows : cols;
    storage.resize(size());
    data_ = storage.data();
  }

  int size() const { return rows_ * cols_; }

  Map& map() { return *static_cast<Map*>(this); }

  ConstMap const_map() const { return ConstMap(data_, rows_, cols_, stride_); }

 protected:
  std::vector<Scalar> storage;
};

template <typename MatrixType>
void MakeZero(MatrixType* m) {
  // 128 is the "quantized zero" here: the Gemm calls below use lhs/rhs
  // offsets of -128, so the uint8 value 128 represents the real value 0.
  for (int c = 0; c < m->cols(); c++) {
    for (int r = 0; r < m->rows(); r++) {
      (*m)(r, c) = 128;
    }
  }
}

}  // end namespace gemmlowp

template <typename BitDepthParams>
float benchmark_8bit(int rows, int depth, int cols) {
  using namespace gemmlowp;
  typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType;
  typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType;
  typedef Matrix<std::uint8_t, MapOrder::ColMajor> ResultType;

  LhsType lhs;
  RhsType rhs;
  ResultType result;
  lhs.Resize(rows, depth);
  rhs.Resize(depth, cols);
  result.Resize(rows, cols);
  MakeZero(&lhs);
  MakeZero(&rhs);
  MakeZero(&result);

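  // Output pipeline: requantize the int32 accumulators down to uint8 via a
  // fixed-point multiply and right shift, then saturating-cast to [0, 255].
  // The multiplier/shift/offset values below are arbitrary: this benchmark
  // only measures speed, not the numerical meaning of the result.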
  typedef std::tuple<OutputStageQuantizeDownInt32ByFixedPoint,
                     OutputStageSaturatingCastToUint8>
      Pipeline;
  gemmlowp::OutputStageQuantizeDownInt32ByFixedPoint quantize_down_stage;
  quantize_down_stage.result_offset_after_shift = 128;
  quantize_down_stage.result_fixedpoint_multiplier = 1234567890;
  quantize_down_stage.result_shift = 16;
  gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage;
  const auto output_pipeline =
      std::make_tuple(quantize_down_stage, saturating_cast_stage);
  GemmContext gemm_context;
  gemm_context.set_max_num_threads(kNumThreads);
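  // Warm-up run before timing, so that one-time initialization and cache
  // warm-up are not counted in the measurement.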
  gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::uint8_t, BitDepthParams>(
      &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
      -128, output_pipeline);

  double time_start = real_time_in_seconds();
  double t = time_start;
  int iters = 0;
  int iters_at_a_time = 1;
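  // Keep running until kBenchmarkSecs has elapsed, doubling the number of
  // iterations between timer reads so that timer overhead is amortized even
  // for very fast (small) shapes.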
  while (t - time_start < kBenchmarkSecs) {
    for (int i = 0; i < iters_at_a_time; i++) {
      gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::uint8_t,
                                       BitDepthParams>(
          &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
          -128, output_pipeline);
      iters++;
    }
    iters_at_a_time *= 2;
    t = real_time_in_seconds();
  }
  return (t - time_start) / iters;
}

template <typename BitDepthParams>
float benchmark_8bit_to_32bit(int rows, int depth, int cols) {
  using namespace gemmlowp;
  typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType;
  typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType;
  typedef Matrix<std::int32_t, MapOrder::ColMajor> ResultType;

  LhsType lhs;
  RhsType rhs;
  ResultType result;
  lhs.Resize(rows, depth);
  rhs.Resize(depth, cols);
  result.Resize(rows, cols);
  MakeZero(&lhs);
  MakeZero(&rhs);
  MakeZero(&result);

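  // An empty output pipeline means no requantization: the Gemm writes the
  // raw int32 accumulators directly to the result matrix.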
  typedef std::tuple<> EmptyPipeline;
  GemmContext gemm_context;
  gemm_context.set_max_num_threads(kNumThreads);
  // Warm-up run, then the same timing strategy as in benchmark_8bit above.
  gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::int32_t, BitDepthParams>(
      &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
      -128, EmptyPipeline());

  double time_start = real_time_in_seconds();
  double t = time_start;
  int iters = 0;
  int iters_at_a_time = 1;
  while (t - time_start < kBenchmarkSecs) {
    for (int i = 0; i < iters_at_a_time; i++) {
      gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::int32_t,
                                       BitDepthParams>(
          &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
          -128, EmptyPipeline());
      iters++;
    }
    iters_at_a_time *= 2;
    t = real_time_in_seconds();
  }
  return (t - time_start) / iters;
}

struct Shape {
  int rows;
  int depth;
  int cols;
};

bool operator==(const Shape& s1, const Shape& s2) {
  return s1.rows == s2.rows && s1.depth == s2.depth && s1.cols == s2.cols;
}

// Lexicographic order on (depth, rows, cols), so that Shape can be used as a
// std::map / std::set key.
bool operator<(const Shape& shape1, const Shape& shape2) {
  return shape1.depth < shape2.depth ||
         (shape1.depth == shape2.depth &&
          (shape1.rows < shape2.rows ||
           (shape1.rows == shape2.rows && shape1.cols < shape2.cols)));
}

#ifdef _WIN32
#include <windows.h>
// Windows' Sleep takes milliseconds, while POSIX sleep takes seconds.
#define sleep(t) Sleep(1000 * (t))
#endif

float benchmark(const Shape& shape) {
  if (kCooldownBeforeBenchmarkSecs) {
    sleep(kCooldownBeforeBenchmarkSecs);
  }
#if defined BENCHMARK_8bit
  // Benchmark the fast 8bit path, using L8R8WithLhsNonzeroBitDepthParams.
  // This is the recommended default: it's what most applications want to
  // use, as it's the fastest.
  // The contract is that LHS must take values in [1, 255], while RHS can take
  // any value in [0, 255].
  return benchmark_8bit<gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
      shape.rows, shape.depth, shape.cols);
#elif defined BENCHMARK_8bit_wide
  // Variant benchmarking the slower (mostly legacy) DefaultL8R8BitDepthParams.
  // The only contract difference is that both LHS and RHS can take values in
  // [0, 255].
  return benchmark_8bit<gemmlowp::DefaultL8R8BitDepthParams>(
      shape.rows, shape.depth, shape.cols);
#elif defined BENCHMARK_8bit_to_32bit
  // Variant of BENCHMARK_8bit where the user asks for the raw int32
  // accumulators, instead of an 8bit-downscaled result.
  return benchmark_8bit_to_32bit<gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
      shape.rows, shape.depth, shape.cols);
#elif defined BENCHMARK_8bit_to_32bit_wide
  // Variant of BENCHMARK_8bit_wide where the user asks for the raw int32
  // accumulators, instead of an 8bit-downscaled result.
  return benchmark_8bit_to_32bit<gemmlowp::DefaultL8R8BitDepthParams>(
      shape.rows, shape.depth, shape.cols);
#elif defined BENCHMARK_float
  return benchmark_float(shape.rows, shape.depth, shape.cols);
#else
#error What arithmetic path should we benchmark? (Suggestion: #define BENCHMARK_8bit)
#endif
}

std::set<int> all_sizes() {
  std::set<int> sizes;
  // Powers of two from 1 to 2048.
  for (int i = 1; i <= 2048; i *= 2) {
    sizes.insert(i);
  }
  // Geometric progression with ratio sqrt(2), from 8 to 2048.
  for (double x = 8; x <= 2048; x *= std::sqrt(2.)) {
    sizes.insert(static_cast<int>(std::round(x)));
  }
  // Geometric progression with ratio 2^(1/4), from 16 to 512.
  for (double x = 16; x <= 512; x *= std::pow(2., 1. / 4.)) {
    sizes.insert(static_cast<int>(std::round(x)));
  }
  return sizes;
}

std::mt19937& RandomEngine() {
  // Default-seeded engine: the shuffle order is pseudo-random but
  // reproducible from run to run.
  static std::mt19937 engine;
  return engine;
}

std::vector<Shape> all_shapes_in_random_order() {
  std::vector<Shape> shapes;
  const std::set<int> sizes = all_sizes();
#if defined BENCHMARK_ROWS
  // Benchmark one specific shape.
  Shape shape;
  shape.rows = BENCHMARK_ROWS;
  shape.depth = BENCHMARK_DEPTH;
  shape.cols = BENCHMARK_COLS;
  shapes.push_back(shape);
#elif defined BENCHMARK_QUICK
  // Benchmark an assortment of cubic shapes.
  for (int size : sizes) {
    Shape shape;
    shape.rows = size;
    shape.depth = size;
    shape.cols = size;
    shapes.push_back(shape);
  }
#elif defined BENCHMARK_EXHAUSTIVE
  // Benchmark all combinations of rows, depth, cols drawn from all_sizes().
  for (int rows : sizes) {
    for (int depth : sizes) {
      for (int cols : sizes) {
        Shape shape;
        shape.rows = rows;
        shape.depth = depth;
        shape.cols = cols;
        shapes.push_back(shape);
      }
    }
  }
#else
#error What shapes should we benchmark? (Suggestion: #define BENCHMARK_QUICK)
#endif
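  // Randomize the benchmarking order, so that time-dependent effects such as
  // thermal throttling don't systematically bias particular shapes.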
  std::shuffle(std::begin(shapes), std::end(shapes), RandomEngine());
  return shapes;
}

void run_benchmarks(std::map<Shape, float>* results) {
  // Queue kPasses copies of every shape, each pass in its own random order.
  std::vector<Shape> shapes;
  for (int pass = 0; pass < kPasses; pass++) {
    const std::vector<Shape> pass_shapes = all_shapes_in_random_order();
    shapes.insert(std::end(shapes), std::begin(pass_shapes),
                  std::end(pass_shapes));
  }

  const double time_start = gemmlowp::real_time_in_seconds();
  for (std::size_t i = 0; i < shapes.size(); i++) {
    const double ratio = static_cast<double>(i) / shapes.size();
    const double elapsed = gemmlowp::real_time_in_seconds() - time_start;
    const double elapsed_hours = elapsed / 3600.;
    // Linear extrapolation of the remaining time from progress so far
    // (guarding against division by zero on the first iteration).
    const double eta_hours =
        ratio > 0 ? elapsed_hours * (1. - ratio) / ratio : 0.;
    fprintf(stderr,
            "Benchmarking: %.2f%% done, Elapsed: %.2f hours, ETA: %.2f "
            "hours...   \r",
            100. * ratio, elapsed_hours, eta_hours);
    fflush(stderr);
    const Shape& shape = shapes[i];
    float latency = benchmark(shape);
    // Keep the best (minimum) latency seen for each shape across all passes.
    if (results->count(shape)) {
      (*results)[shape] = std::min(latency, (*results)[shape]);
    } else {
      (*results)[shape] = latency;
    }
  }
  fprintf(stderr, "\n");
}

int main() {
  std::map<Shape, float> results;

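  // Optional gemmlowp profiling support (build with -DGEMMLOWP_PROFILING):
  // each thread to be profiled must register itself before profiling starts.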
#ifdef GEMMLOWP_PROFILING
  gemmlowp::RegisterCurrentThreadForProfiling();
  gemmlowp::StartProfiling();
#endif

  run_benchmarks(&results);

#ifdef GEMMLOWP_PROFILING
  gemmlowp::FinishProfiling();
#endif

  printf("Using %d thread(s)\n", kNumThreads);
  printf("depth,rows,cols,latency(s),Gop/s\n");
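  // Gop/s: a rows x depth x cols GEMM does rows*depth*cols multiply-adds,
  // i.e. 2*rows*depth*cols arithmetic ops; 2e-9 converts ops/s to Gop/s.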
  for (const auto& result : results) {
    const Shape& shape = result.first;
    printf("%d,%d,%d,%.4g,%.4g\n", shape.depth, shape.rows, shape.cols,
           result.second,
           2e-9 * shape.depth * shape.rows * shape.cols / result.second);
  }
}