• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include <unistd.h>
16 #ifdef __APPLE__
17 #include <sys/time.h>
18 #endif
19 
20 #include <cstdint>
21 #include <cstdlib>
22 #include <ctime>
23 #include <iostream>
24 #include <map>
25 #include <vector>
26 #ifdef __APPLE__
27 #include <TargetConditionals.h>
28 #endif
29 
30 #include "test.h"
31 
32 #ifndef GEMMLOWP_TEST_BIT_DEPTH_PARAMS
33 #define GEMMLOWP_TEST_BIT_DEPTH_PARAMS DefaultL8R8BitDepthParams
34 #endif
35 
36 #if defined(__arm__) && !defined(GEMMLOWP_NEON)
37 #warning "Building without NEON support on ARM, check your compiler setup!"
38 #endif
39 
40 #if defined(__SSE4_2__) && !defined(GEMMLOWP_SSE4)
41 #warning \
42     "Building without SSE4.2 support on SSE4.2 enabled machine, check your compiler setup!"
43 #endif
44 
45 namespace gemmlowp {
46 
time()47 double time() {
48 #ifdef __APPLE__
49   timeval t;
50   gettimeofday(&t, nullptr);
51   return t.tv_sec + 1e-6 * t.tv_usec;
52 #else
53   timespec t;
54   clock_gettime(CLOCK_REALTIME, &t);
55   return t.tv_sec + 1e-9 * t.tv_nsec;
56 #endif
57 }
58 
59 const double min_accurate_duration = 1e-1;
60 const std::size_t min_working_set_size = 16 * 1024 * 1024;
61 
62 struct gemm_t {
63   int rows, depth, cols;
gemm_tgemmlowp::gemm_t64   gemm_t() : rows(0), depth(0), cols(0) {}
gemm_tgemmlowp::gemm_t65   gemm_t(int r, int d, int c) : rows(r), depth(d), cols(c) {}
66 };
67 
operator <(const gemm_t & a,const gemm_t & b)68 bool operator<(const gemm_t& a, const gemm_t& b) {
69   return a.rows < b.rows ||
70          (a.rows <= b.rows &&
71           (a.depth < b.depth || (a.depth <= b.depth && (a.cols < b.cols))));
72 }
73 
74 template <typename LhsType, typename RhsType, typename ResultType>
time_for_gemms(GemmContext * context,const std::vector<gemm_t> & gemms)75 double time_for_gemms(GemmContext* context, const std::vector<gemm_t>& gemms) {
76   typedef std::uint8_t Scalar;
77 
78   // set up the matrix pool
79 
80   std::size_t combined_gemm_sizes = 0;
81   for (auto gemm : gemms) {
82     int rows = gemm.rows;
83     int depth = gemm.depth;
84     int cols = gemm.cols;
85     combined_gemm_sizes +=
86         sizeof(Scalar) * (rows * depth + depth * cols + rows * cols);
87   }
88 
89   const std::size_t pool_size = 1 + min_working_set_size / combined_gemm_sizes;
90 
91   std::vector<LhsType> lhs(pool_size * gemms.size());
92   std::vector<RhsType> rhs(pool_size * gemms.size());
93   std::vector<ResultType> result(pool_size * gemms.size());
94 
95   for (std::size_t i = 0; i < pool_size; i++) {
96     for (std::size_t j = 0; j < gemms.size(); j++) {
97       int k = i * gemms.size() + j;
98       lhs[k].Resize(gemms[j].rows, gemms[j].depth);
99       MakeConstant(&lhs[k], 0);
100       rhs[k].Resize(gemms[j].depth, gemms[j].cols);
101       MakeConstant(&rhs[k], 0);
102       result[k].Resize(gemms[j].rows, gemms[j].cols);
103       MakeConstant(&result[k], 0);
104     }
105   }
106 
107   // main benchmark loop
108 
109   int iters_at_a_time = 1;
110   float time_per_iter = 0.0f;
111   std::size_t pool_index = 0;
112 
113   while (true) {
114     double starttime = time();
115     for (int i = 0; i < iters_at_a_time; i++) {
116       for (size_t j = 0; j < gemms.size(); j++) {
117         int k = pool_index * gemms.size() + j;
118         Gemm<std::uint8_t, GEMMLOWP_TEST_BIT_DEPTH_PARAMS>(
119             context, lhs[k].const_map(), rhs[k].const_map(), &result[k].map(),
120             -75, -91, 74980, 123, 20);
121       }
122       pool_index++;
123       if (pool_index == pool_size) {
124         pool_index = 0;
125       }
126     }
127     double endtime = time();
128 
129     const float timing = static_cast<float>(endtime - starttime);
130 
131     if (timing >= min_accurate_duration) {
132       time_per_iter = timing / iters_at_a_time;
133       break;
134     }
135 
136     iters_at_a_time *= 2;
137   }
138 
139   return time_per_iter;
140 }
141 
142 template <typename LhsType, typename RhsType, typename ResultType>
gflops_for_gemms(GemmContext * context,const std::vector<gemm_t> & gemms)143 double gflops_for_gemms(GemmContext* context,
144                         const std::vector<gemm_t>& gemms) {
145   const double time_per_iter =
146       time_for_gemms<LhsType, RhsType, ResultType>(context, gemms);
147   double ops = 0;
148   for (auto gemm : gemms) {
149     ops += 2.0 * gemm.rows * gemm.depth * gemm.cols;
150   }
151   return 1e-9 * ops / time_per_iter;
152 }
153 
benchmark(GemmContext * context)154 void benchmark(GemmContext* context) {
155   std::map<gemm_t, std::vector<double>> benchmark_results;
156 
157   std::vector<gemm_t> benchmark_gemms;
158   benchmark_gemms.emplace_back(10, 10, 10);
159   benchmark_gemms.emplace_back(20, 20, 20);
160   benchmark_gemms.emplace_back(30, 30, 30);
161   benchmark_gemms.emplace_back(40, 40, 40);
162   benchmark_gemms.emplace_back(50, 50, 50);
163   benchmark_gemms.emplace_back(60, 60, 60);
164   benchmark_gemms.emplace_back(64, 256, 147);
165   benchmark_gemms.emplace_back(100, 100, 1);
166   benchmark_gemms.emplace_back(100, 100, 100);
167   benchmark_gemms.emplace_back(100, 1000, 100);
168   benchmark_gemms.emplace_back(1000, 1000, 1);
169   benchmark_gemms.emplace_back(1000, 1000, 10);
170   benchmark_gemms.emplace_back(1000, 1000, 100);
171   benchmark_gemms.emplace_back(1000, 1000, 1000);
172 
173   const int repeat = 2;
174 
175   typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType;
176   typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType;
177   typedef Matrix<std::uint8_t, MapOrder::ColMajor> ResultType;
178 
179 #ifdef GEMMLOWP_TEST_PROFILE
180   gemmlowp::RegisterCurrentThreadForProfiling();
181   gemmlowp::StartProfiling();
182 #endif
183 
184   // We don't record the first repetition, it's just warm-up.
185   for (int r = 0; r < repeat + 1; r++) {
186     std::cout << "repetition " << r + 1 << "/" << repeat + 1 << "...\r"
187               << std::flush;
188     for (auto gemm : benchmark_gemms) {
189       double gflops = 0;
190       std::vector<gemm_t> unique_gemm;
191       unique_gemm.push_back(gemm);
192       gflops =
193           gflops_for_gemms<LhsType, RhsType, ResultType>(context, unique_gemm);
194       if (r > 0) {
195         benchmark_results[gemm].emplace_back(gflops);
196       }
197     }
198   }
199 
200 #ifdef GEMMLOWP_TEST_PROFILE
201   gemmlowp::FinishProfiling();
202 #endif
203 
204   std::cout << "                                                \r"
205             << std::flush;
206 
207   std::cout.precision(4);
208 
209   for (auto b : benchmark_results) {
210     sort(b.second.begin(), b.second.end());
211     std::cout << b.first.rows << "x" << b.first.depth << "x" << b.first.cols
212               << " : " << b.second.back() << " GFlops/s" << std::endl;
213   }
214   std::cout << std::endl;
215 }
216 
benchmark_gemm_sizes(GemmContext * context,const std::vector<gemm_t> & gemms,double mintime)217 void benchmark_gemm_sizes(GemmContext* context,
218                           const std::vector<gemm_t>& gemms, double mintime) {
219   typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType;
220   typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType;
221   typedef Matrix<std::uint8_t, MapOrder::ColMajor> ResultType;
222 
223   std::vector<float> gemm_times;
224   std::cout << "running for " << mintime << " seconds..." << std::endl;
225 
226 #ifdef GEMMLOWP_TEST_PROFILE
227   gemmlowp::RegisterCurrentThreadForProfiling();
228   gemmlowp::StartProfiling();
229 #endif
230 
231   double starttime = time();
232   while (time() < starttime + mintime) {
233     gemm_times.push_back(
234         time_for_gemms<LhsType, RhsType, ResultType>(context, gemms));
235   }
236 
237 #ifdef GEMMLOWP_TEST_PROFILE
238   gemmlowp::FinishProfiling();
239 #endif
240 
241   std::sort(gemm_times.begin(), gemm_times.end());
242 
243   double sum_gemm_times = 0;
244   double sum_gemm_times_trimmed = 0;
245   int count_gemm_times_trimmed = 0;
246   const float trim_ratio = 0.25;
247   const size_t count_trimmed = gemm_times.size() * trim_ratio;
248   double sum_gemm_times_best = 0;
249   int count_gemm_times_best = 0;
250   const float best_ratio = 0.1;
251   const size_t count_best = gemm_times.size() * best_ratio;
252 
253   for (size_t i = 0; i < gemm_times.size(); i++) {
254     sum_gemm_times += gemm_times[i];
255     if (i >= count_trimmed && i < gemm_times.size() - count_trimmed) {
256       sum_gemm_times_trimmed += gemm_times[i];
257       count_gemm_times_trimmed++;
258     }
259     if (i < count_best) {
260       sum_gemm_times_best += gemm_times[i];
261       count_gemm_times_best++;
262     }
263   }
264 
265   const double min_latency = gemm_times.front();
266   const double max_latency = gemm_times.back();
267   const double mean_latency = sum_gemm_times / gemm_times.size();
268   const double trimmed_mean_latency =
269       sum_gemm_times_trimmed / count_gemm_times_trimmed;
270   const double best_mean_latency = sum_gemm_times_best / count_gemm_times_best;
271 
272   std::cout << "Graph latency (over " << gemm_times.size()
273             << " iterations):" << std::endl;
274   std::cout << "  Best:             " << min_latency << "s" << std::endl;
275   std::cout << "  Worst:            " << max_latency << "s" << std::endl;
276   std::cout << "  Mean:             " << mean_latency << "s" << std::endl;
277   std::cout << "  " << 100 * trim_ratio
278             << "% trimmed mean: " << trimmed_mean_latency << "s" << std::endl;
279   std::cout << "  Mean of " << 100 * best_ratio
280             << "% best: " << best_mean_latency << "s" << std::endl;
281 }
282 
benchmark_googlenet(GemmContext * context)283 void benchmark_googlenet(GemmContext* context) {
284   // These are the m, n, k sizes for a typical GoogLeNet.
285   const int googlenet_gemm_sizes[] = {
286       12544, 64,  147, 3136, 64,   64,   3136, 192,  576,  784, 64,   192,
287       784,   96,  192, 784,  128,  864,  784,  16,   192,  784, 32,   400,
288       784,   32,  192, 784,  128,  256,  784,  128,  256,  784, 192,  1152,
289       784,   32,  256, 784,  96,   800,  784,  64,   256,  196, 192,  480,
290       196,   96,  480, 196,  204,  864,  196,  16,   480,  196, 48,   400,
291       196,   64,  480, 196,  160,  508,  196,  112,  508,  196, 224,  1008,
292       196,   24,  508, 196,  64,   600,  196,  64,   508,  196, 128,  512,
293       196,   128, 512, 196,  256,  1152, 196,  24,   512,  196, 64,   600,
294       196,   64,  512, 196,  112,  512,  196,  144,  512,  196, 288,  1296,
295       196,   32,  512, 196,  64,   800,  196,  64,   512,  196, 256,  528,
296       196,   160, 528, 196,  320,  1440, 196,  32,   528,  196, 128,  800,
297       196,   128, 528, 49,   256,  832,  49,   160,  832,  49,  320,  1440,
298       49,    48,  832, 49,   128,  1200, 49,   128,  832,  49,  384,  832,
299       49,    192, 832, 49,   384,  1728, 49,   48,   832,  49,  128,  1200,
300       49,    128, 832, 16,   128,  508,  1,    1024, 2048, 1,   1008, 1024,
301       16,    128, 528, 1,    1024, 2048, 1,    1008, 1024, 1,   1008, 1024,
302   };
303   assert(sizeof(googlenet_gemm_sizes) % (3 * sizeof(googlenet_gemm_sizes[0])) ==
304          0);
305   const std::size_t num_googlenet_gemms =
306       sizeof(googlenet_gemm_sizes) / (3 * sizeof(googlenet_gemm_sizes[0]));
307 
308   std::vector<gemm_t> googlenet_gemms(num_googlenet_gemms);
309   for (std::size_t i = 0; i < num_googlenet_gemms; i++) {
310     googlenet_gemms[i].rows = googlenet_gemm_sizes[3 * i + 1];
311     googlenet_gemms[i].depth = googlenet_gemm_sizes[3 * i + 2];
312     googlenet_gemms[i].cols = googlenet_gemm_sizes[3 * i + 0];
313   }
314 
315   const double mintime = 20.0;
316   benchmark_gemm_sizes(context, googlenet_gemms, mintime);
317 }
318 
benchmark_small_model(GemmContext * context)319 void benchmark_small_model(GemmContext* context) {
320   // These are the m, n, k sizes for a small model with large batches.
321   const int small_model_gemm_sizes[] = {
322       29232, 16, 25, 7308, 6, 400, 203, 3002, 216,
323   };
324   assert(sizeof(small_model_gemm_sizes) %
325              (3 * sizeof(small_model_gemm_sizes[0])) ==
326          0);
327   const std::size_t num_small_model_gemms =
328       sizeof(small_model_gemm_sizes) / (3 * sizeof(small_model_gemm_sizes[0]));
329 
330   std::vector<gemm_t> small_model_gemms(num_small_model_gemms);
331   for (std::size_t i = 0; i < num_small_model_gemms; i++) {
332     small_model_gemms[i].rows = small_model_gemm_sizes[3 * i + 1];
333     small_model_gemms[i].depth = small_model_gemm_sizes[3 * i + 2];
334     small_model_gemms[i].cols = small_model_gemm_sizes[3 * i + 0];
335   }
336 
337   const double mintime = 10.0;
338   benchmark_gemm_sizes(context, small_model_gemms, mintime);
339 }
340 
benchmark_all()341 void benchmark_all() {
342   {
343     gemmlowp::GemmContext context;
344     std::cout << "Benchmarking small model GEMMs..." << std::endl;
345     gemmlowp::benchmark_small_model(&context);
346   }
347 
348   {
349     gemmlowp::GemmContext context;
350     std::cout << "Benchmarking typical GoogLeNet GEMMs..." << std::endl;
351     gemmlowp::benchmark_googlenet(&context);
352   }
353 
354   {
355     gemmlowp::GemmContext context;
356     context.set_max_num_threads(0);
357     std::cout << "Benchmarking multi-threaded mode..." << std::endl;
358     gemmlowp::benchmark(&context);
359   }
360 
361   {
362     gemmlowp::GemmContext context;
363     context.set_max_num_threads(1);
364     std::cout << "Benchmarking single-threaded mode..." << std::endl;
365     gemmlowp::benchmark(&context);
366   }
367 }
368 
369 }  // end namespace gemmlowp
370 
371 // For iOS, we need to define our own main(), so skip it here.
372 #if !(defined(__APPLE__) && (TARGET_OS_IPHONE || TARGET_IPHONE_SIMULATOR))
main()373 int main() { gemmlowp::benchmark_all(); }
374 #endif
375