1 // Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #ifdef __APPLE__
16 #include <sys/time.h>
17 #endif
18
#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <map>
#include <vector>
25 #ifdef __APPLE__
26 #include <TargetConditionals.h>
27 #endif
28
29 #include "test.h"
30
31 #ifndef GEMMLOWP_TEST_BIT_DEPTH_PARAMS
32 #define GEMMLOWP_TEST_BIT_DEPTH_PARAMS DefaultL8R8BitDepthParams
33 #endif
34
35 #if defined(__arm__) && !defined(GEMMLOWP_NEON)
36 #warning "Building without NEON support on ARM, check your compiler setup!"
37 #endif
38
39 #if defined(__SSE4_2__) && !defined(GEMMLOWP_SSE4)
40 #warning \
41 "Building without SSE4.2 support on SSE4.2 enabled machine, check your compiler setup!"
42 #endif
43
44 namespace gemmlowp {
45
// Shortest timed run, in seconds, that time_for_gemms() accepts as a
// measurement; shorter runs are repeated with twice as many iterations.
const double min_accurate_duration = 0.1;

// Lower bound, in bytes, on the total matrix data that time_for_gemms()
// allocates across its pool of operand/result matrices (16 MiB).
const std::size_t min_working_set_size =
    static_cast<std::size_t>(16) * 1024 * 1024;
48
// Dimensions of one matrix product: a (rows x depth) left-hand side times a
// (depth x cols) right-hand side, producing a (rows x cols) result.
struct gemm_t {
  int rows = 0;
  int depth = 0;
  int cols = 0;

  // Creates an empty (0 x 0 x 0) shape.
  gemm_t() = default;

  // Creates a shape with the given dimensions.
  gemm_t(int r, int d, int c) : rows(r), depth(d), cols(c) {}
};
54
operator <(const gemm_t & a,const gemm_t & b)55 bool operator<(const gemm_t& a, const gemm_t& b) {
56 return a.rows < b.rows ||
57 (a.rows <= b.rows &&
58 (a.depth < b.depth || (a.depth <= b.depth && (a.cols < b.cols))));
59 }
60
61 template <typename LhsType, typename RhsType, typename ResultType>
time_for_gemms(GemmContext * context,const std::vector<gemm_t> & gemms)62 double time_for_gemms(GemmContext* context, const std::vector<gemm_t>& gemms) {
63 typedef std::uint8_t Scalar;
64
65 // set up the matrix pool
66
67 std::size_t combined_gemm_sizes = 0;
68 for (auto gemm : gemms) {
69 int rows = gemm.rows;
70 int depth = gemm.depth;
71 int cols = gemm.cols;
72 combined_gemm_sizes +=
73 sizeof(Scalar) * (rows * depth + depth * cols + rows * cols);
74 }
75
76 const std::size_t pool_size = 1 + min_working_set_size / combined_gemm_sizes;
77
78 std::vector<LhsType> lhs(pool_size * gemms.size());
79 std::vector<RhsType> rhs(pool_size * gemms.size());
80 std::vector<ResultType> result(pool_size * gemms.size());
81
82 for (std::size_t i = 0; i < pool_size; i++) {
83 for (std::size_t j = 0; j < gemms.size(); j++) {
84 int k = i * gemms.size() + j;
85 lhs[k].Resize(gemms[j].rows, gemms[j].depth);
86 MakeConstant(&lhs[k], 0);
87 rhs[k].Resize(gemms[j].depth, gemms[j].cols);
88 MakeConstant(&rhs[k], 0);
89 result[k].Resize(gemms[j].rows, gemms[j].cols);
90 MakeConstant(&result[k], 0);
91 }
92 }
93
94 // main benchmark loop
95
96 int iters_at_a_time = 1;
97 float time_per_iter = 0.0f;
98 std::size_t pool_index = 0;
99
100 while (true) {
101 double starttime = real_time_in_seconds();
102 for (int i = 0; i < iters_at_a_time; i++) {
103 for (size_t j = 0; j < gemms.size(); j++) {
104 size_t k = pool_index * gemms.size() + j;
105 Gemm<std::uint8_t, GEMMLOWP_TEST_BIT_DEPTH_PARAMS>(
106 context, lhs[k].const_map(), rhs[k].const_map(), &result[k].map(),
107 -75, -91, 74980, 123, 20);
108 }
109 pool_index++;
110 if (pool_index == pool_size) {
111 pool_index = 0;
112 }
113 }
114 double endtime = real_time_in_seconds();
115
116 const float timing = static_cast<float>(endtime - starttime);
117
118 if (timing >= min_accurate_duration) {
119 time_per_iter = timing / iters_at_a_time;
120 break;
121 }
122
123 iters_at_a_time *= 2;
124 }
125
126 return time_per_iter;
127 }
128
129 template <typename LhsType, typename RhsType, typename ResultType>
gflops_for_gemms(GemmContext * context,const std::vector<gemm_t> & gemms)130 double gflops_for_gemms(GemmContext* context,
131 const std::vector<gemm_t>& gemms) {
132 const double time_per_iter =
133 time_for_gemms<LhsType, RhsType, ResultType>(context, gemms);
134 double ops = 0;
135 for (auto gemm : gemms) {
136 ops += 2.0 * gemm.rows * gemm.depth * gemm.cols;
137 }
138 return 1e-9 * ops / time_per_iter;
139 }
140
benchmark(GemmContext * context)141 void benchmark(GemmContext* context) {
142 std::map<gemm_t, std::vector<double>> benchmark_results;
143
144 std::vector<gemm_t> benchmark_gemms;
145 benchmark_gemms.emplace_back(10, 10, 10);
146 benchmark_gemms.emplace_back(20, 20, 20);
147 benchmark_gemms.emplace_back(30, 30, 30);
148 benchmark_gemms.emplace_back(40, 40, 40);
149 benchmark_gemms.emplace_back(50, 50, 50);
150 benchmark_gemms.emplace_back(60, 60, 60);
151 benchmark_gemms.emplace_back(64, 256, 147);
152 benchmark_gemms.emplace_back(100, 100, 1);
153 benchmark_gemms.emplace_back(100, 100, 100);
154 benchmark_gemms.emplace_back(100, 1000, 100);
155 benchmark_gemms.emplace_back(1000, 1000, 1);
156 benchmark_gemms.emplace_back(1000, 1000, 10);
157 benchmark_gemms.emplace_back(1000, 1000, 100);
158 benchmark_gemms.emplace_back(1000, 1000, 1000);
159
160 const int repeat = 2;
161
162 typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType;
163 typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType;
164 typedef Matrix<std::uint8_t, MapOrder::ColMajor> ResultType;
165
166 #ifdef GEMMLOWP_TEST_PROFILE
167 gemmlowp::RegisterCurrentThreadForProfiling();
168 gemmlowp::StartProfiling();
169 #endif
170
171 // We don't record the first repetition, it's just warm-up.
172 for (int r = 0; r < repeat + 1; r++) {
173 std::cout << "repetition " << r + 1 << "/" << repeat + 1 << "...\r"
174 << std::flush;
175 for (auto gemm : benchmark_gemms) {
176 double gflops = 0;
177 std::vector<gemm_t> unique_gemm;
178 unique_gemm.push_back(gemm);
179 gflops =
180 gflops_for_gemms<LhsType, RhsType, ResultType>(context, unique_gemm);
181 if (r > 0) {
182 benchmark_results[gemm].emplace_back(gflops);
183 }
184 }
185 }
186
187 #ifdef GEMMLOWP_TEST_PROFILE
188 gemmlowp::FinishProfiling();
189 #endif
190
191 std::cout << " \r"
192 << std::flush;
193
194 std::cout.precision(4);
195
196 for (auto b : benchmark_results) {
197 sort(b.second.begin(), b.second.end());
198 std::cout << b.first.rows << "x" << b.first.depth << "x" << b.first.cols
199 << " : " << b.second.back() << " GFlops/s" << std::endl;
200 }
201 std::cout << std::endl;
202 }
203
benchmark_gemm_sizes(GemmContext * context,const std::vector<gemm_t> & gemms,double mintime)204 void benchmark_gemm_sizes(GemmContext* context,
205 const std::vector<gemm_t>& gemms, double mintime) {
206 typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType;
207 typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType;
208 typedef Matrix<std::uint8_t, MapOrder::ColMajor> ResultType;
209
210 std::vector<float> gemm_times;
211 std::cout << "running for " << mintime << " seconds..." << std::endl;
212
213 #ifdef GEMMLOWP_TEST_PROFILE
214 gemmlowp::RegisterCurrentThreadForProfiling();
215 gemmlowp::StartProfiling();
216 #endif
217
218 double starttime = real_time_in_seconds();
219 while (real_time_in_seconds() < starttime + mintime) {
220 gemm_times.push_back(
221 time_for_gemms<LhsType, RhsType, ResultType>(context, gemms));
222 }
223
224 #ifdef GEMMLOWP_TEST_PROFILE
225 gemmlowp::FinishProfiling();
226 #endif
227
228 std::sort(gemm_times.begin(), gemm_times.end());
229
230 double sum_gemm_times = 0;
231 double sum_gemm_times_trimmed = 0;
232 int count_gemm_times_trimmed = 0;
233 const float trim_ratio = 0.25;
234 const size_t count_trimmed = gemm_times.size() * trim_ratio;
235 double sum_gemm_times_best = 0;
236 int count_gemm_times_best = 0;
237 const float best_ratio = 0.1;
238 const size_t count_best = gemm_times.size() * best_ratio;
239
240 for (size_t i = 0; i < gemm_times.size(); i++) {
241 sum_gemm_times += gemm_times[i];
242 if (i >= count_trimmed && i < gemm_times.size() - count_trimmed) {
243 sum_gemm_times_trimmed += gemm_times[i];
244 count_gemm_times_trimmed++;
245 }
246 if (i < count_best) {
247 sum_gemm_times_best += gemm_times[i];
248 count_gemm_times_best++;
249 }
250 }
251
252 const double min_latency = gemm_times.front();
253 const double max_latency = gemm_times.back();
254 const double mean_latency = sum_gemm_times / gemm_times.size();
255 const double trimmed_mean_latency =
256 sum_gemm_times_trimmed / count_gemm_times_trimmed;
257 const double best_mean_latency = sum_gemm_times_best / count_gemm_times_best;
258
259 std::cout << "Graph latency (over " << gemm_times.size()
260 << " iterations):" << std::endl;
261 std::cout << " Best: " << min_latency << "s" << std::endl;
262 std::cout << " Worst: " << max_latency << "s" << std::endl;
263 std::cout << " Mean: " << mean_latency << "s" << std::endl;
264 std::cout << " " << 100 * trim_ratio
265 << "% trimmed mean: " << trimmed_mean_latency << "s" << std::endl;
266 std::cout << " Mean of " << 100 * best_ratio
267 << "% best: " << best_mean_latency << "s" << std::endl;
268 }
269
benchmark_googlenet(GemmContext * context)270 void benchmark_googlenet(GemmContext* context) {
271 // These are the m, n, k sizes for a typical GoogLeNet.
272 const int googlenet_gemm_sizes[] = {
273 12544, 64, 147, 3136, 64, 64, 3136, 192, 576, 784, 64, 192,
274 784, 96, 192, 784, 128, 864, 784, 16, 192, 784, 32, 400,
275 784, 32, 192, 784, 128, 256, 784, 128, 256, 784, 192, 1152,
276 784, 32, 256, 784, 96, 800, 784, 64, 256, 196, 192, 480,
277 196, 96, 480, 196, 204, 864, 196, 16, 480, 196, 48, 400,
278 196, 64, 480, 196, 160, 508, 196, 112, 508, 196, 224, 1008,
279 196, 24, 508, 196, 64, 600, 196, 64, 508, 196, 128, 512,
280 196, 128, 512, 196, 256, 1152, 196, 24, 512, 196, 64, 600,
281 196, 64, 512, 196, 112, 512, 196, 144, 512, 196, 288, 1296,
282 196, 32, 512, 196, 64, 800, 196, 64, 512, 196, 256, 528,
283 196, 160, 528, 196, 320, 1440, 196, 32, 528, 196, 128, 800,
284 196, 128, 528, 49, 256, 832, 49, 160, 832, 49, 320, 1440,
285 49, 48, 832, 49, 128, 1200, 49, 128, 832, 49, 384, 832,
286 49, 192, 832, 49, 384, 1728, 49, 48, 832, 49, 128, 1200,
287 49, 128, 832, 16, 128, 508, 1, 1024, 2048, 1, 1008, 1024,
288 16, 128, 528, 1, 1024, 2048, 1, 1008, 1024, 1, 1008, 1024,
289 };
290 assert(sizeof(googlenet_gemm_sizes) % (3 * sizeof(googlenet_gemm_sizes[0])) ==
291 0);
292 const std::size_t num_googlenet_gemms =
293 sizeof(googlenet_gemm_sizes) / (3 * sizeof(googlenet_gemm_sizes[0]));
294
295 std::vector<gemm_t> googlenet_gemms(num_googlenet_gemms);
296 for (std::size_t i = 0; i < num_googlenet_gemms; i++) {
297 googlenet_gemms[i].rows = googlenet_gemm_sizes[3 * i + 1];
298 googlenet_gemms[i].depth = googlenet_gemm_sizes[3 * i + 2];
299 googlenet_gemms[i].cols = googlenet_gemm_sizes[3 * i + 0];
300 }
301
302 const double mintime = 20.0;
303 benchmark_gemm_sizes(context, googlenet_gemms, mintime);
304 }
305
benchmark_small_model(GemmContext * context)306 void benchmark_small_model(GemmContext* context) {
307 // These are the m, n, k sizes for a small model with large batches.
308 const int small_model_gemm_sizes[] = {
309 29232, 16, 25, 7308, 6, 400, 203, 3002, 216,
310 };
311 assert(sizeof(small_model_gemm_sizes) %
312 (3 * sizeof(small_model_gemm_sizes[0])) ==
313 0);
314 const std::size_t num_small_model_gemms =
315 sizeof(small_model_gemm_sizes) / (3 * sizeof(small_model_gemm_sizes[0]));
316
317 std::vector<gemm_t> small_model_gemms(num_small_model_gemms);
318 for (std::size_t i = 0; i < num_small_model_gemms; i++) {
319 small_model_gemms[i].rows = small_model_gemm_sizes[3 * i + 1];
320 small_model_gemms[i].depth = small_model_gemm_sizes[3 * i + 2];
321 small_model_gemms[i].cols = small_model_gemm_sizes[3 * i + 0];
322 }
323
324 const double mintime = 10.0;
325 benchmark_gemm_sizes(context, small_model_gemms, mintime);
326 }
327
benchmark_all()328 void benchmark_all() {
329 {
330 gemmlowp::GemmContext context;
331 std::cout << "Benchmarking small model GEMMs..." << std::endl;
332 gemmlowp::benchmark_small_model(&context);
333 }
334
335 {
336 gemmlowp::GemmContext context;
337 std::cout << "Benchmarking typical GoogLeNet GEMMs..." << std::endl;
338 gemmlowp::benchmark_googlenet(&context);
339 }
340
341 {
342 gemmlowp::GemmContext context;
343 context.set_max_num_threads(0);
344 std::cout << "Benchmarking multi-threaded mode..." << std::endl;
345 gemmlowp::benchmark(&context);
346 }
347
348 {
349 gemmlowp::GemmContext context;
350 context.set_max_num_threads(1);
351 std::cout << "Benchmarking single-threaded mode..." << std::endl;
352 gemmlowp::benchmark(&context);
353 }
354 }
355
356 } // end namespace gemmlowp
357
358 // For iOS, we need to define our own main(), so skip it here.
359 #if !(defined(__APPLE__) && (TARGET_OS_IPHONE || TARGET_IPHONE_SIMULATOR))
main()360 int main() { gemmlowp::benchmark_all(); }
361 #endif
362