1 // Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
#include <unistd.h>
#ifdef __APPLE__
#include <sys/time.h>
#endif

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <map>
#include <tuple>
#include <vector>
#ifdef __APPLE__
#include <TargetConditionals.h>
#endif

#include "test.h"
31
32 #ifndef GEMMLOWP_TEST_BIT_DEPTH_PARAMS
33 #define GEMMLOWP_TEST_BIT_DEPTH_PARAMS DefaultL8R8BitDepthParams
34 #endif
35
36 #if defined(__arm__) && !defined(GEMMLOWP_NEON)
37 #warning "Building without NEON support on ARM, check your compiler setup!"
38 #endif
39
40 #if defined(__SSE4_2__) && !defined(GEMMLOWP_SSE4)
41 #warning \
42 "Building without SSE4.2 support on SSE4.2 enabled machine, check your compiler setup!"
43 #endif
44
45 namespace gemmlowp {
46
// Returns the current wall-clock time, in seconds, as a double.
double time() {
#ifdef __APPLE__
  // Apple platforms historically lacked clock_gettime; use gettimeofday.
  timeval tv;
  gettimeofday(&tv, nullptr);
  return tv.tv_sec + 1e-6 * tv.tv_usec;
#else
  timespec ts;
  clock_gettime(CLOCK_REALTIME, &ts);
  return ts.tv_sec + 1e-9 * ts.tv_nsec;
#endif
}
58
// Minimum wall-clock span (seconds) for a timing measurement to be trusted;
// shorter measurements are retried with a doubled iteration count.
const double min_accurate_duration = 1e-1;
// Minimum combined footprint (bytes) of the matrix pool cycled through while
// timing — presumably sized to exceed CPU caches so that measurements are not
// purely cache-resident (TODO confirm against target hardware).
const std::size_t min_working_set_size = 16 * 1024 * 1024;
61
// Dimensions of one GEMM: a (rows x depth) LHS times a (depth x cols) RHS.
struct gemm_t {
  int rows, depth, cols;
  // Default: an empty (0 x 0 x 0) problem.
  gemm_t() : gemm_t(0, 0, 0) {}
  gemm_t(int num_rows, int num_depth, int num_cols)
      : rows(num_rows), depth(num_depth), cols(num_cols) {}
};
67
operator <(const gemm_t & a,const gemm_t & b)68 bool operator<(const gemm_t& a, const gemm_t& b) {
69 return a.rows < b.rows ||
70 (a.rows <= b.rows &&
71 (a.depth < b.depth || (a.depth <= b.depth && (a.cols < b.cols))));
72 }
73
// Returns the average wall-clock time, in seconds, of one pass over all the
// GEMMs in |gemms|. The pass is repeated in ever-doubling batches until the
// measured span exceeds min_accurate_duration, and matrices are drawn from a
// rotating pool larger than min_working_set_size — presumably to keep the
// working set out of cache (TODO confirm intent).
template <typename LhsType, typename RhsType, typename ResultType>
double time_for_gemms(GemmContext* context, const std::vector<gemm_t>& gemms) {
  typedef std::uint8_t Scalar;

  // set up the matrix pool

  // Byte footprint of one full set of LHS/RHS/result matrices.
  std::size_t combined_gemm_sizes = 0;
  for (auto gemm : gemms) {
    int rows = gemm.rows;
    int depth = gemm.depth;
    int cols = gemm.cols;
    combined_gemm_sizes +=
        sizeof(Scalar) * (rows * depth + depth * cols + rows * cols);
  }

  // Enough replicas of the working set to exceed min_working_set_size.
  const std::size_t pool_size = 1 + min_working_set_size / combined_gemm_sizes;

  std::vector<LhsType> lhs(pool_size * gemms.size());
  std::vector<RhsType> rhs(pool_size * gemms.size());
  std::vector<ResultType> result(pool_size * gemms.size());

  // Size and zero-fill every matrix in the pool up front, so allocation and
  // initialization never happen inside the timed loop.
  for (std::size_t i = 0; i < pool_size; i++) {
    for (std::size_t j = 0; j < gemms.size(); j++) {
      int k = i * gemms.size() + j;
      lhs[k].Resize(gemms[j].rows, gemms[j].depth);
      MakeConstant(&lhs[k], 0);
      rhs[k].Resize(gemms[j].depth, gemms[j].cols);
      MakeConstant(&rhs[k], 0);
      result[k].Resize(gemms[j].rows, gemms[j].cols);
      MakeConstant(&result[k], 0);
    }
  }

  // main benchmark loop

  int iters_at_a_time = 1;
  float time_per_iter = 0.0f;
  std::size_t pool_index = 0;  // which replica of the pool to use next

  while (true) {
    double starttime = time();
    for (int i = 0; i < iters_at_a_time; i++) {
      for (size_t j = 0; j < gemms.size(); j++) {
        int k = pool_index * gemms.size() + j;
        // Fixed example quantization parameters (offsets, multiplier,
        // shift) — presumably arbitrary, since only timing matters here.
        Gemm<std::uint8_t, GEMMLOWP_TEST_BIT_DEPTH_PARAMS>(
            context, lhs[k].const_map(), rhs[k].const_map(), &result[k].map(),
            -75, -91, 74980, 123, 20);
      }
      // Rotate through the pool so consecutive iterations touch different
      // memory.
      pool_index++;
      if (pool_index == pool_size) {
        pool_index = 0;
      }
    }
    double endtime = time();

    const float timing = static_cast<float>(endtime - starttime);

    // Accept the measurement only once it spans min_accurate_duration;
    // otherwise double the batch size and re-measure.
    if (timing >= min_accurate_duration) {
      time_per_iter = timing / iters_at_a_time;
      break;
    }

    iters_at_a_time *= 2;
  }

  return time_per_iter;
}
141
142 template <typename LhsType, typename RhsType, typename ResultType>
gflops_for_gemms(GemmContext * context,const std::vector<gemm_t> & gemms)143 double gflops_for_gemms(GemmContext* context,
144 const std::vector<gemm_t>& gemms) {
145 const double time_per_iter =
146 time_for_gemms<LhsType, RhsType, ResultType>(context, gemms);
147 double ops = 0;
148 for (auto gemm : gemms) {
149 ops += 2.0 * gemm.rows * gemm.depth * gemm.cols;
150 }
151 return 1e-9 * ops / time_per_iter;
152 }
153
benchmark(GemmContext * context)154 void benchmark(GemmContext* context) {
155 std::map<gemm_t, std::vector<double>> benchmark_results;
156
157 std::vector<gemm_t> benchmark_gemms;
158 benchmark_gemms.emplace_back(10, 10, 10);
159 benchmark_gemms.emplace_back(20, 20, 20);
160 benchmark_gemms.emplace_back(30, 30, 30);
161 benchmark_gemms.emplace_back(40, 40, 40);
162 benchmark_gemms.emplace_back(50, 50, 50);
163 benchmark_gemms.emplace_back(60, 60, 60);
164 benchmark_gemms.emplace_back(64, 256, 147);
165 benchmark_gemms.emplace_back(100, 100, 1);
166 benchmark_gemms.emplace_back(100, 100, 100);
167 benchmark_gemms.emplace_back(100, 1000, 100);
168 benchmark_gemms.emplace_back(1000, 1000, 1);
169 benchmark_gemms.emplace_back(1000, 1000, 10);
170 benchmark_gemms.emplace_back(1000, 1000, 100);
171 benchmark_gemms.emplace_back(1000, 1000, 1000);
172
173 const int repeat = 2;
174
175 typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType;
176 typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType;
177 typedef Matrix<std::uint8_t, MapOrder::ColMajor> ResultType;
178
179 #ifdef GEMMLOWP_TEST_PROFILE
180 gemmlowp::RegisterCurrentThreadForProfiling();
181 gemmlowp::StartProfiling();
182 #endif
183
184 // We don't record the first repetition, it's just warm-up.
185 for (int r = 0; r < repeat + 1; r++) {
186 std::cout << "repetition " << r + 1 << "/" << repeat + 1 << "...\r"
187 << std::flush;
188 for (auto gemm : benchmark_gemms) {
189 double gflops = 0;
190 std::vector<gemm_t> unique_gemm;
191 unique_gemm.push_back(gemm);
192 gflops =
193 gflops_for_gemms<LhsType, RhsType, ResultType>(context, unique_gemm);
194 if (r > 0) {
195 benchmark_results[gemm].emplace_back(gflops);
196 }
197 }
198 }
199
200 #ifdef GEMMLOWP_TEST_PROFILE
201 gemmlowp::FinishProfiling();
202 #endif
203
204 std::cout << " \r"
205 << std::flush;
206
207 std::cout.precision(4);
208
209 for (auto b : benchmark_results) {
210 sort(b.second.begin(), b.second.end());
211 std::cout << b.first.rows << "x" << b.first.depth << "x" << b.first.cols
212 << " : " << b.second.back() << " GFlops/s" << std::endl;
213 }
214 std::cout << std::endl;
215 }
216
benchmark_gemm_sizes(GemmContext * context,const std::vector<gemm_t> & gemms,double mintime)217 void benchmark_gemm_sizes(GemmContext* context,
218 const std::vector<gemm_t>& gemms, double mintime) {
219 typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType;
220 typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType;
221 typedef Matrix<std::uint8_t, MapOrder::ColMajor> ResultType;
222
223 std::vector<float> gemm_times;
224 std::cout << "running for " << mintime << " seconds..." << std::endl;
225
226 #ifdef GEMMLOWP_TEST_PROFILE
227 gemmlowp::RegisterCurrentThreadForProfiling();
228 gemmlowp::StartProfiling();
229 #endif
230
231 double starttime = time();
232 while (time() < starttime + mintime) {
233 gemm_times.push_back(
234 time_for_gemms<LhsType, RhsType, ResultType>(context, gemms));
235 }
236
237 #ifdef GEMMLOWP_TEST_PROFILE
238 gemmlowp::FinishProfiling();
239 #endif
240
241 std::sort(gemm_times.begin(), gemm_times.end());
242
243 double sum_gemm_times = 0;
244 double sum_gemm_times_trimmed = 0;
245 int count_gemm_times_trimmed = 0;
246 const float trim_ratio = 0.25;
247 const size_t count_trimmed = gemm_times.size() * trim_ratio;
248 double sum_gemm_times_best = 0;
249 int count_gemm_times_best = 0;
250 const float best_ratio = 0.1;
251 const size_t count_best = gemm_times.size() * best_ratio;
252
253 for (size_t i = 0; i < gemm_times.size(); i++) {
254 sum_gemm_times += gemm_times[i];
255 if (i >= count_trimmed && i < gemm_times.size() - count_trimmed) {
256 sum_gemm_times_trimmed += gemm_times[i];
257 count_gemm_times_trimmed++;
258 }
259 if (i < count_best) {
260 sum_gemm_times_best += gemm_times[i];
261 count_gemm_times_best++;
262 }
263 }
264
265 const double min_latency = gemm_times.front();
266 const double max_latency = gemm_times.back();
267 const double mean_latency = sum_gemm_times / gemm_times.size();
268 const double trimmed_mean_latency =
269 sum_gemm_times_trimmed / count_gemm_times_trimmed;
270 const double best_mean_latency = sum_gemm_times_best / count_gemm_times_best;
271
272 std::cout << "Graph latency (over " << gemm_times.size()
273 << " iterations):" << std::endl;
274 std::cout << " Best: " << min_latency << "s" << std::endl;
275 std::cout << " Worst: " << max_latency << "s" << std::endl;
276 std::cout << " Mean: " << mean_latency << "s" << std::endl;
277 std::cout << " " << 100 * trim_ratio
278 << "% trimmed mean: " << trimmed_mean_latency << "s" << std::endl;
279 std::cout << " Mean of " << 100 * best_ratio
280 << "% best: " << best_mean_latency << "s" << std::endl;
281 }
282
// Benchmarks the set of GEMM shapes occurring in a typical GoogLeNet
// inference, running for at least 20 seconds, and prints latency statistics.
void benchmark_googlenet(GemmContext* context) {
  // These are the m, n, k sizes for a typical GoogLeNet.
  const int googlenet_gemm_sizes[] = {
      12544, 64,  147,  3136, 64,  64,  3136, 192,  576,  784, 64,   192,
      784,   96,  192,  784,  128, 864, 784,  16,   192,  784, 32,   400,
      784,   32,  192,  784,  128, 256, 784,  128,  256,  784, 192,  1152,
      784,   32,  256,  784,  96,  800, 784,  64,   256,  196, 192,  480,
      196,   96,  480,  196,  204, 864, 196,  16,   480,  196, 48,   400,
      196,   64,  480,  196,  160, 508, 196,  112,  508,  196, 224,  1008,
      196,   24,  508,  196,  64,  600, 196,  64,   508,  196, 128,  512,
      196,   128, 512,  196,  256, 1152, 196, 24,   512,  196, 64,   600,
      196,   64,  512,  196,  112, 512, 196,  144,  512,  196, 288,  1296,
      196,   32,  512,  196,  64,  800, 196,  64,   512,  196, 256,  528,
      196,   160, 528,  196,  320, 1440, 196, 32,   528,  196, 128,  800,
      196,   128, 528,  49,   256, 832, 49,   160,  832,  49,  320,  1440,
      49,    48,  832,  49,   128, 1200, 49,  128,  832,  49,  384,  832,
      49,    192, 832,  49,   384, 1728, 49,  48,   832,  49,  128,  1200,
      49,    128, 832,  16,   128, 508, 1,    1024, 2048, 1,   1008, 1024,
      16,    128, 528,  1,    1024, 2048, 1,  1008, 1024, 1,   1008, 1024,
  };
  // The table must hold complete (m, n, k) triples.
  assert(sizeof(googlenet_gemm_sizes) % (3 * sizeof(googlenet_gemm_sizes[0])) ==
         0);
  const std::size_t num_googlenet_gemms =
      sizeof(googlenet_gemm_sizes) / (3 * sizeof(googlenet_gemm_sizes[0]));

  std::vector<gemm_t> googlenet_gemms(num_googlenet_gemms);
  for (std::size_t i = 0; i < num_googlenet_gemms; i++) {
    // Note the mapping from each (m, n, k) triple: rows = n, depth = k,
    // cols = m — presumably matching gemmlowp's row-major LHS / col-major
    // result convention for these layer shapes (TODO confirm).
    googlenet_gemms[i].rows = googlenet_gemm_sizes[3 * i + 1];
    googlenet_gemms[i].depth = googlenet_gemm_sizes[3 * i + 2];
    googlenet_gemms[i].cols = googlenet_gemm_sizes[3 * i + 0];
  }

  const double mintime = 20.0;
  benchmark_gemm_sizes(context, googlenet_gemms, mintime);
}
318
benchmark_small_model(GemmContext * context)319 void benchmark_small_model(GemmContext* context) {
320 // These are the m, n, k sizes for a small model with large batches.
321 const int small_model_gemm_sizes[] = {
322 29232, 16, 25, 7308, 6, 400, 203, 3002, 216,
323 };
324 assert(sizeof(small_model_gemm_sizes) %
325 (3 * sizeof(small_model_gemm_sizes[0])) ==
326 0);
327 const std::size_t num_small_model_gemms =
328 sizeof(small_model_gemm_sizes) / (3 * sizeof(small_model_gemm_sizes[0]));
329
330 std::vector<gemm_t> small_model_gemms(num_small_model_gemms);
331 for (std::size_t i = 0; i < num_small_model_gemms; i++) {
332 small_model_gemms[i].rows = small_model_gemm_sizes[3 * i + 1];
333 small_model_gemms[i].depth = small_model_gemm_sizes[3 * i + 2];
334 small_model_gemms[i].cols = small_model_gemm_sizes[3 * i + 0];
335 }
336
337 const double mintime = 10.0;
338 benchmark_gemm_sizes(context, small_model_gemms, mintime);
339 }
340
benchmark_all()341 void benchmark_all() {
342 {
343 gemmlowp::GemmContext context;
344 std::cout << "Benchmarking small model GEMMs..." << std::endl;
345 gemmlowp::benchmark_small_model(&context);
346 }
347
348 {
349 gemmlowp::GemmContext context;
350 std::cout << "Benchmarking typical GoogLeNet GEMMs..." << std::endl;
351 gemmlowp::benchmark_googlenet(&context);
352 }
353
354 {
355 gemmlowp::GemmContext context;
356 context.set_max_num_threads(0);
357 std::cout << "Benchmarking multi-threaded mode..." << std::endl;
358 gemmlowp::benchmark(&context);
359 }
360
361 {
362 gemmlowp::GemmContext context;
363 context.set_max_num_threads(1);
364 std::cout << "Benchmarking single-threaded mode..." << std::endl;
365 gemmlowp::benchmark(&context);
366 }
367 }
368
369 } // end namespace gemmlowp
370
371 // For iOS, we need to define our own main(), so skip it here.
372 #if !(defined(__APPLE__) && (TARGET_OS_IPHONE || TARGET_IPHONE_SIMULATOR))
main()373 int main() { gemmlowp::benchmark_all(); }
374 #endif
375