1 // Copyright 2015 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include <unistd.h>
16 #ifdef __APPLE__
17 #include <sys/time.h>
18 #endif
19
20 #include <cstdint>
21 #include <cstdlib>
22 #include <ctime>
23 #include <iostream>
24 #include <map>
25 #include <vector>
26 #ifdef __APPLE__
27 #include <TargetConditionals.h>
28 #endif
29
30 #include "test.h"
31
32 #ifndef GEMMLOWP_TEST_BIT_DEPTH_PARAMS
33 #define GEMMLOWP_TEST_BIT_DEPTH_PARAMS DefaultL8R8BitDepthParams
34 #endif
35
36 #if defined(__arm__) && !defined(GEMMLOWP_NEON)
37 #warning "Building without NEON support on ARM, check your compiler setup!"
38 #endif
39
40 namespace gemmlowp {
41
time()42 double time() {
43 #ifdef __APPLE__
44 timeval t;
45 gettimeofday(&t, nullptr);
46 return t.tv_sec + 1e-6 * t.tv_usec;
47 #else
48 timespec t;
49 clock_gettime(CLOCK_REALTIME, &t);
50 return t.tv_sec + 1e-9 * t.tv_nsec;
51 #endif
52 }
53
54 const double min_accurate_duration = 1e-1;
55 const std::size_t min_working_set_size = 16 * 1024 * 1024;
56
57 struct gemm_t {
58 int rows, depth, cols;
gemm_tgemmlowp::gemm_t59 gemm_t() : rows(0), depth(0), cols(0) {}
gemm_tgemmlowp::gemm_t60 gemm_t(int r, int d, int c) : rows(r), depth(d), cols(c) {}
61 };
62
operator <(const gemm_t & a,const gemm_t & b)63 bool operator<(const gemm_t& a, const gemm_t& b) {
64 return a.rows < b.rows ||
65 (a.rows <= b.rows &&
66 (a.depth < b.depth || (a.depth <= b.depth && (a.cols < b.cols))));
67 }
68
69 template <typename LhsType, typename RhsType, typename ResultType>
time_for_gemms(GemmContext * context,const std::vector<gemm_t> & gemms)70 double time_for_gemms(GemmContext* context, const std::vector<gemm_t>& gemms) {
71 typedef std::uint8_t Scalar;
72
73 // set up the matrix pool
74
75 std::size_t combined_gemm_sizes = 0;
76 for (auto gemm : gemms) {
77 int rows = gemm.rows;
78 int depth = gemm.depth;
79 int cols = gemm.cols;
80 combined_gemm_sizes +=
81 sizeof(Scalar) * (rows * depth + depth * cols + rows * cols);
82 }
83
84 const std::size_t pool_size = 1 + min_working_set_size / combined_gemm_sizes;
85
86 std::vector<LhsType> lhs(pool_size * gemms.size());
87 std::vector<RhsType> rhs(pool_size * gemms.size());
88 std::vector<ResultType> result(pool_size * gemms.size());
89
90 for (std::size_t i = 0; i < pool_size; i++) {
91 for (std::size_t j = 0; j < gemms.size(); j++) {
92 int k = i * gemms.size() + j;
93 lhs[k].Resize(gemms[j].rows, gemms[j].depth);
94 MakeConstant(&lhs[k], 0);
95 rhs[k].Resize(gemms[j].depth, gemms[j].cols);
96 MakeConstant(&rhs[k], 0);
97 result[k].Resize(gemms[j].rows, gemms[j].cols);
98 MakeConstant(&result[k], 0);
99 }
100 }
101
102 // main benchmark loop
103
104 int iters_at_a_time = 1;
105 float time_per_iter = 0.0f;
106 std::size_t pool_index = 0;
107
108 while (true) {
109 double starttime = time();
110 for (int i = 0; i < iters_at_a_time; i++) {
111 for (size_t j = 0; j < gemms.size(); j++) {
112 int k = pool_index * gemms.size() + j;
113 Gemm<std::uint8_t, GEMMLOWP_TEST_BIT_DEPTH_PARAMS>(
114 context, lhs[k].const_map(), rhs[k].const_map(), &result[k].map(),
115 -75, -91, 74980, 123, 20);
116 }
117 pool_index++;
118 if (pool_index == pool_size) {
119 pool_index = 0;
120 }
121 }
122 double endtime = time();
123
124 const float timing = static_cast<float>(endtime - starttime);
125
126 if (timing >= min_accurate_duration) {
127 time_per_iter = timing / iters_at_a_time;
128 break;
129 }
130
131 iters_at_a_time *= 2;
132 }
133
134 return time_per_iter;
135 }
136
137 template <typename LhsType, typename RhsType, typename ResultType>
gflops_for_gemms(GemmContext * context,const std::vector<gemm_t> & gemms)138 double gflops_for_gemms(GemmContext* context,
139 const std::vector<gemm_t>& gemms) {
140 const double time_per_iter =
141 time_for_gemms<LhsType, RhsType, ResultType>(context, gemms);
142 double ops = 0;
143 for (auto gemm : gemms) {
144 ops += 2.0 * gemm.rows * gemm.depth * gemm.cols;
145 }
146 return 1e-9 * ops / time_per_iter;
147 }
148
benchmark(GemmContext * context)149 void benchmark(GemmContext* context) {
150 std::map<gemm_t, std::vector<double>> benchmark_results;
151
152 std::vector<gemm_t> benchmark_gemms;
153 benchmark_gemms.emplace_back(10, 10, 10);
154 benchmark_gemms.emplace_back(20, 20, 20);
155 benchmark_gemms.emplace_back(30, 30, 30);
156 benchmark_gemms.emplace_back(40, 40, 40);
157 benchmark_gemms.emplace_back(50, 50, 50);
158 benchmark_gemms.emplace_back(60, 60, 60);
159 benchmark_gemms.emplace_back(64, 256, 147);
160 benchmark_gemms.emplace_back(100, 100, 1);
161 benchmark_gemms.emplace_back(100, 100, 100);
162 benchmark_gemms.emplace_back(100, 1000, 100);
163 benchmark_gemms.emplace_back(1000, 1000, 1);
164 benchmark_gemms.emplace_back(1000, 1000, 10);
165 benchmark_gemms.emplace_back(1000, 1000, 100);
166 benchmark_gemms.emplace_back(1000, 1000, 1000);
167
168 const int repeat = 2;
169
170 typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType;
171 typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType;
172 typedef Matrix<std::uint8_t, MapOrder::ColMajor> ResultType;
173
174 #ifdef GEMMLOWP_TEST_PROFILE
175 gemmlowp::RegisterCurrentThreadForProfiling();
176 gemmlowp::StartProfiling();
177 #endif
178
179 // We don't record the first repetition, it's just warm-up.
180 for (int r = 0; r < repeat + 1; r++) {
181 std::cout << "repetition " << r + 1 << "/" << repeat + 1 << "...\r"
182 << std::flush;
183 for (auto gemm : benchmark_gemms) {
184 double gflops = 0;
185 std::vector<gemm_t> unique_gemm;
186 unique_gemm.push_back(gemm);
187 gflops =
188 gflops_for_gemms<LhsType, RhsType, ResultType>(context, unique_gemm);
189 if (r > 0) {
190 benchmark_results[gemm].emplace_back(gflops);
191 }
192 }
193 }
194
195 #ifdef GEMMLOWP_TEST_PROFILE
196 gemmlowp::FinishProfiling();
197 #endif
198
199 std::cout << " \r"
200 << std::flush;
201
202 std::cout.precision(4);
203
204 for (auto b : benchmark_results) {
205 sort(b.second.begin(), b.second.end());
206 std::cout << b.first.rows << "x" << b.first.depth << "x" << b.first.cols
207 << " : " << b.second.back() << " GFlops/s" << std::endl;
208 }
209 std::cout << std::endl;
210 }
211
benchmark_gemm_sizes(GemmContext * context,const std::vector<gemm_t> & gemms,double mintime)212 void benchmark_gemm_sizes(GemmContext* context,
213 const std::vector<gemm_t>& gemms, double mintime) {
214 typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType;
215 typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType;
216 typedef Matrix<std::uint8_t, MapOrder::ColMajor> ResultType;
217
218 std::vector<float> gemm_times;
219 std::cout << "running for " << mintime << " seconds..." << std::endl;
220
221 #ifdef GEMMLOWP_TEST_PROFILE
222 gemmlowp::RegisterCurrentThreadForProfiling();
223 gemmlowp::StartProfiling();
224 #endif
225
226 double starttime = time();
227 while (time() < starttime + mintime) {
228 gemm_times.push_back(
229 time_for_gemms<LhsType, RhsType, ResultType>(context, gemms));
230 }
231
232 #ifdef GEMMLOWP_TEST_PROFILE
233 gemmlowp::FinishProfiling();
234 #endif
235
236 std::sort(gemm_times.begin(), gemm_times.end());
237
238 double sum_gemm_times = 0;
239 double sum_gemm_times_trimmed = 0;
240 int count_gemm_times_trimmed = 0;
241 const float trim_ratio = 0.25;
242 const size_t count_trimmed = gemm_times.size() * trim_ratio;
243 double sum_gemm_times_best = 0;
244 int count_gemm_times_best = 0;
245 const float best_ratio = 0.1;
246 const size_t count_best = gemm_times.size() * best_ratio;
247
248 for (size_t i = 0; i < gemm_times.size(); i++) {
249 sum_gemm_times += gemm_times[i];
250 if (i >= count_trimmed && i < gemm_times.size() - count_trimmed) {
251 sum_gemm_times_trimmed += gemm_times[i];
252 count_gemm_times_trimmed++;
253 }
254 if (i < count_best) {
255 sum_gemm_times_best += gemm_times[i];
256 count_gemm_times_best++;
257 }
258 }
259
260 const double min_latency = gemm_times.front();
261 const double max_latency = gemm_times.back();
262 const double mean_latency = sum_gemm_times / gemm_times.size();
263 const double trimmed_mean_latency =
264 sum_gemm_times_trimmed / count_gemm_times_trimmed;
265 const double best_mean_latency = sum_gemm_times_best / count_gemm_times_best;
266
267 std::cout << "Graph latency (over " << gemm_times.size()
268 << " iterations):" << std::endl;
269 std::cout << " Best: " << min_latency << "s" << std::endl;
270 std::cout << " Worst: " << max_latency << "s" << std::endl;
271 std::cout << " Mean: " << mean_latency << "s" << std::endl;
272 std::cout << " " << 100 * trim_ratio
273 << "% trimmed mean: " << trimmed_mean_latency << "s" << std::endl;
274 std::cout << " Mean of " << 100 * best_ratio
275 << "% best: " << best_mean_latency << "s" << std::endl;
276 }
277
benchmark_googlenet(GemmContext * context)278 void benchmark_googlenet(GemmContext* context) {
279 // These are the m, n, k sizes for a typical GoogLeNet.
280 const int googlenet_gemm_sizes[] = {
281 12544, 64, 147, 3136, 64, 64, 3136, 192, 576, 784, 64, 192,
282 784, 96, 192, 784, 128, 864, 784, 16, 192, 784, 32, 400,
283 784, 32, 192, 784, 128, 256, 784, 128, 256, 784, 192, 1152,
284 784, 32, 256, 784, 96, 800, 784, 64, 256, 196, 192, 480,
285 196, 96, 480, 196, 204, 864, 196, 16, 480, 196, 48, 400,
286 196, 64, 480, 196, 160, 508, 196, 112, 508, 196, 224, 1008,
287 196, 24, 508, 196, 64, 600, 196, 64, 508, 196, 128, 512,
288 196, 128, 512, 196, 256, 1152, 196, 24, 512, 196, 64, 600,
289 196, 64, 512, 196, 112, 512, 196, 144, 512, 196, 288, 1296,
290 196, 32, 512, 196, 64, 800, 196, 64, 512, 196, 256, 528,
291 196, 160, 528, 196, 320, 1440, 196, 32, 528, 196, 128, 800,
292 196, 128, 528, 49, 256, 832, 49, 160, 832, 49, 320, 1440,
293 49, 48, 832, 49, 128, 1200, 49, 128, 832, 49, 384, 832,
294 49, 192, 832, 49, 384, 1728, 49, 48, 832, 49, 128, 1200,
295 49, 128, 832, 16, 128, 508, 1, 1024, 2048, 1, 1008, 1024,
296 16, 128, 528, 1, 1024, 2048, 1, 1008, 1024, 1, 1008, 1024,
297 };
298 assert(sizeof(googlenet_gemm_sizes) % (3 * sizeof(googlenet_gemm_sizes[0])) ==
299 0);
300 const std::size_t num_googlenet_gemms =
301 sizeof(googlenet_gemm_sizes) / (3 * sizeof(googlenet_gemm_sizes[0]));
302
303 std::vector<gemm_t> googlenet_gemms(num_googlenet_gemms);
304 for (std::size_t i = 0; i < num_googlenet_gemms; i++) {
305 googlenet_gemms[i].rows = googlenet_gemm_sizes[3 * i + 1];
306 googlenet_gemms[i].depth = googlenet_gemm_sizes[3 * i + 2];
307 googlenet_gemms[i].cols = googlenet_gemm_sizes[3 * i + 0];
308 }
309
310 const double mintime = 20.0;
311 benchmark_gemm_sizes(context, googlenet_gemms, mintime);
312 }
313
benchmark_small_model(GemmContext * context)314 void benchmark_small_model(GemmContext* context) {
315 // These are the m, n, k sizes for a small model with large batches.
316 const int small_model_gemm_sizes[] = {
317 29232, 16, 25, 7308, 6, 400, 203, 3002, 216,
318 };
319 assert(sizeof(small_model_gemm_sizes) %
320 (3 * sizeof(small_model_gemm_sizes[0])) ==
321 0);
322 const std::size_t num_small_model_gemms =
323 sizeof(small_model_gemm_sizes) / (3 * sizeof(small_model_gemm_sizes[0]));
324
325 std::vector<gemm_t> small_model_gemms(num_small_model_gemms);
326 for (std::size_t i = 0; i < num_small_model_gemms; i++) {
327 small_model_gemms[i].rows = small_model_gemm_sizes[3 * i + 1];
328 small_model_gemms[i].depth = small_model_gemm_sizes[3 * i + 2];
329 small_model_gemms[i].cols = small_model_gemm_sizes[3 * i + 0];
330 }
331
332 const double mintime = 10.0;
333 benchmark_gemm_sizes(context, small_model_gemms, mintime);
334 }
335
benchmark_all()336 void benchmark_all() {
337 {
338 gemmlowp::GemmContext context;
339 std::cout << "Benchmarking small model GEMMs..." << std::endl;
340 gemmlowp::benchmark_small_model(&context);
341 }
342
343 {
344 gemmlowp::GemmContext context;
345 std::cout << "Benchmarking typical GoogLeNet GEMMs..." << std::endl;
346 gemmlowp::benchmark_googlenet(&context);
347 }
348
349 {
350 gemmlowp::GemmContext context;
351 std::cout << "Benchmarking default mode (typically multi-threaded)..."
352 << std::endl;
353 gemmlowp::benchmark(&context);
354 }
355
356 {
357 gemmlowp::GemmContext context;
358 context.set_max_num_threads(1);
359 std::cout << "Benchmarking single-threaded mode..." << std::endl;
360 gemmlowp::benchmark(&context);
361 }
362 }
363
364 } // end namespace gemmlowp
365
366 // For iOS, we need to define our own main(), so skip it here.
367 #if !(defined(__APPLE__) && (TARGET_OS_IPHONE || TARGET_IPHONE_SIMULATOR))
main()368 int main() { gemmlowp::benchmark_all(); }
369 #endif
370