1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2015 Benoit Jacob <benoitjacob@google.com>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9
10 #include <iostream>
11 #include <cstdint>
12 #include <cstdlib>
13 #include <vector>
14 #include <fstream>
15 #include <memory>
16 #include <cstdio>
17
18 bool eigen_use_specific_block_size;
19 int eigen_block_size_k, eigen_block_size_m, eigen_block_size_n;
20 #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZES eigen_use_specific_block_size
21 #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K eigen_block_size_k
22 #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M eigen_block_size_m
23 #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N eigen_block_size_n
24 #include <Eigen/Core>
25
26 #include <bench/BenchTimer.h>
27
28 using namespace Eigen;
29 using namespace std;
30
31 static BenchTimer timer;
32
33 // how many times we repeat each measurement.
34 // measurements are randomly shuffled - we're not doing
35 // all N identical measurements in a row.
36 const int measurement_repetitions = 3;
37
38 // Timings below this value are too short to be accurate,
39 // we'll repeat measurements with more iterations until
40 // we get a timing above that threshold.
41 const float min_accurate_time = 1e-2f;
42
43 // See --min-working-set-size command line parameter.
44 size_t min_working_set_size = 0;
45
46 float max_clock_speed = 0.0f;
47
48 // range of sizes that we will benchmark (in all 3 K,M,N dimensions)
49 const size_t maxsize = 2048;
50 const size_t minsize = 16;
51
52 typedef MatrixXf MatrixType;
53 typedef MatrixType::Scalar Scalar;
54 typedef internal::packet_traits<Scalar>::type Packet;
55
56 static_assert((maxsize & (maxsize - 1)) == 0, "maxsize must be a power of two");
57 static_assert((minsize & (minsize - 1)) == 0, "minsize must be a power of two");
58 static_assert(maxsize > minsize, "maxsize must be larger than minsize");
59 static_assert(maxsize < (minsize << 16), "maxsize must be less than (minsize<<16)");
60
61 // just a helper to store a triple of K,M,N sizes for matrix product
62 struct size_triple_t
63 {
64 size_t k, m, n;
size_triple_tsize_triple_t65 size_triple_t() : k(0), m(0), n(0) {}
size_triple_tsize_triple_t66 size_triple_t(size_t _k, size_t _m, size_t _n) : k(_k), m(_m), n(_n) {}
size_triple_tsize_triple_t67 size_triple_t(const size_triple_t& o) : k(o.k), m(o.m), n(o.n) {}
size_triple_tsize_triple_t68 size_triple_t(uint16_t compact)
69 {
70 k = 1 << ((compact & 0xf00) >> 8);
71 m = 1 << ((compact & 0x0f0) >> 4);
72 n = 1 << ((compact & 0x00f) >> 0);
73 }
74 };
75
log2_pot(size_t x)76 uint8_t log2_pot(size_t x) {
77 size_t l = 0;
78 while (x >>= 1) l++;
79 return l;
80 }
81
82 // Convert between size tripes and a compact form fitting in 12 bits
83 // where each size, which must be a POT, is encoded as its log2, on 4 bits
84 // so the largest representable size is 2^15 == 32k ... big enough.
compact_size_triple(size_t k,size_t m,size_t n)85 uint16_t compact_size_triple(size_t k, size_t m, size_t n)
86 {
87 return (log2_pot(k) << 8) | (log2_pot(m) << 4) | log2_pot(n);
88 }
89
compact_size_triple(const size_triple_t & t)90 uint16_t compact_size_triple(const size_triple_t& t)
91 {
92 return compact_size_triple(t.k, t.m, t.n);
93 }
94
95 // A single benchmark. Initially only contains benchmark params.
96 // Then call run(), which stores the result in the gflops field.
97 struct benchmark_t
98 {
99 uint16_t compact_product_size;
100 uint16_t compact_block_size;
101 bool use_default_block_size;
102 float gflops;
benchmark_tbenchmark_t103 benchmark_t()
104 : compact_product_size(0)
105 , compact_block_size(0)
106 , use_default_block_size(false)
107 , gflops(0)
108 {
109 }
benchmark_tbenchmark_t110 benchmark_t(size_t pk, size_t pm, size_t pn,
111 size_t bk, size_t bm, size_t bn)
112 : compact_product_size(compact_size_triple(pk, pm, pn))
113 , compact_block_size(compact_size_triple(bk, bm, bn))
114 , use_default_block_size(false)
115 , gflops(0)
116 {}
benchmark_tbenchmark_t117 benchmark_t(size_t pk, size_t pm, size_t pn)
118 : compact_product_size(compact_size_triple(pk, pm, pn))
119 , compact_block_size(0)
120 , use_default_block_size(true)
121 , gflops(0)
122 {}
123
124 void run();
125 };
126
operator <<(ostream & s,const benchmark_t & b)127 ostream& operator<<(ostream& s, const benchmark_t& b)
128 {
129 s << hex << b.compact_product_size << dec;
130 if (b.use_default_block_size) {
131 size_triple_t t(b.compact_product_size);
132 Index k = t.k, m = t.m, n = t.n;
133 internal::computeProductBlockingSizes<Scalar, Scalar>(k, m, n);
134 s << " default(" << k << ", " << m << ", " << n << ")";
135 } else {
136 s << " " << hex << b.compact_block_size << dec;
137 }
138 s << " " << b.gflops;
139 return s;
140 }
141
142 // We sort first by increasing benchmark parameters,
143 // then by decreasing performance.
operator <(const benchmark_t & b1,const benchmark_t & b2)144 bool operator<(const benchmark_t& b1, const benchmark_t& b2)
145 {
146 return b1.compact_product_size < b2.compact_product_size ||
147 (b1.compact_product_size == b2.compact_product_size && (
148 (b1.compact_block_size < b2.compact_block_size || (
149 b1.compact_block_size == b2.compact_block_size &&
150 b1.gflops > b2.gflops))));
151 }
152
run()153 void benchmark_t::run()
154 {
155 size_triple_t productsizes(compact_product_size);
156
157 if (use_default_block_size) {
158 eigen_use_specific_block_size = false;
159 } else {
160 // feed eigen with our custom blocking params
161 eigen_use_specific_block_size = true;
162 size_triple_t blocksizes(compact_block_size);
163 eigen_block_size_k = blocksizes.k;
164 eigen_block_size_m = blocksizes.m;
165 eigen_block_size_n = blocksizes.n;
166 }
167
168 // set up the matrix pool
169
170 const size_t combined_three_matrices_sizes =
171 sizeof(Scalar) *
172 (productsizes.k * productsizes.m +
173 productsizes.k * productsizes.n +
174 productsizes.m * productsizes.n);
175
176 // 64 M is large enough that nobody has a cache bigger than that,
177 // while still being small enough that everybody has this much RAM,
178 // so conveniently we don't need to special-case platforms here.
179 const size_t unlikely_large_cache_size = 64 << 20;
180
181 const size_t working_set_size =
182 min_working_set_size ? min_working_set_size : unlikely_large_cache_size;
183
184 const size_t matrix_pool_size =
185 1 + working_set_size / combined_three_matrices_sizes;
186
187 MatrixType *lhs = new MatrixType[matrix_pool_size];
188 MatrixType *rhs = new MatrixType[matrix_pool_size];
189 MatrixType *dst = new MatrixType[matrix_pool_size];
190
191 for (size_t i = 0; i < matrix_pool_size; i++) {
192 lhs[i] = MatrixType::Zero(productsizes.m, productsizes.k);
193 rhs[i] = MatrixType::Zero(productsizes.k, productsizes.n);
194 dst[i] = MatrixType::Zero(productsizes.m, productsizes.n);
195 }
196
197 // main benchmark loop
198
199 int iters_at_a_time = 1;
200 float time_per_iter = 0.0f;
201 size_t matrix_index = 0;
202 while (true) {
203
204 double starttime = timer.getCpuTime();
205 for (int i = 0; i < iters_at_a_time; i++) {
206 dst[matrix_index].noalias() = lhs[matrix_index] * rhs[matrix_index];
207 matrix_index++;
208 if (matrix_index == matrix_pool_size) {
209 matrix_index = 0;
210 }
211 }
212 double endtime = timer.getCpuTime();
213
214 const float timing = float(endtime - starttime);
215
216 if (timing >= min_accurate_time) {
217 time_per_iter = timing / iters_at_a_time;
218 break;
219 }
220
221 iters_at_a_time *= 2;
222 }
223
224 delete[] lhs;
225 delete[] rhs;
226 delete[] dst;
227
228 gflops = 2e-9 * productsizes.k * productsizes.m * productsizes.n / time_per_iter;
229 }
230
print_cpuinfo()231 void print_cpuinfo()
232 {
233 #ifdef __linux__
234 cout << "contents of /proc/cpuinfo:" << endl;
235 string line;
236 ifstream cpuinfo("/proc/cpuinfo");
237 if (cpuinfo.is_open()) {
238 while (getline(cpuinfo, line)) {
239 cout << line << endl;
240 }
241 cpuinfo.close();
242 }
243 cout << endl;
244 #elif defined __APPLE__
245 cout << "output of sysctl hw:" << endl;
246 system("sysctl hw");
247 cout << endl;
248 #endif
249 }
250
251 template <typename T>
type_name()252 string type_name()
253 {
254 return "unknown";
255 }
256
257 template<>
type_name()258 string type_name<float>()
259 {
260 return "float";
261 }
262
263 template<>
type_name()264 string type_name<double>()
265 {
266 return "double";
267 }
268
269 struct action_t
270 {
invokation_nameaction_t271 virtual const char* invokation_name() const { abort(); return nullptr; }
runaction_t272 virtual void run() const { abort(); }
~action_taction_t273 virtual ~action_t() {}
274 };
275
show_usage_and_exit(int,char * argv[],const vector<unique_ptr<action_t>> & available_actions)276 void show_usage_and_exit(int /*argc*/, char* argv[],
277 const vector<unique_ptr<action_t>>& available_actions)
278 {
279 cerr << "usage: " << argv[0] << " <action> [options...]" << endl << endl;
280 cerr << "available actions:" << endl << endl;
281 for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
282 cerr << " " << (*it)->invokation_name() << endl;
283 }
284 cerr << endl;
285 cerr << "options:" << endl << endl;
286 cerr << " --min-working-set-size=N:" << endl;
287 cerr << " Set the minimum working set size to N bytes." << endl;
288 cerr << " This is rounded up as needed to a multiple of matrix size." << endl;
289 cerr << " A larger working set lowers the chance of a warm cache." << endl;
290 cerr << " The default value 0 means use a large enough working" << endl;
291 cerr << " set to likely outsize caches." << endl;
292 cerr << " A value of 1 (that is, 1 byte) would mean don't do anything to" << endl;
293 cerr << " avoid warm caches." << endl;
294 exit(1);
295 }
296
measure_clock_speed()297 float measure_clock_speed()
298 {
299 cerr << "Measuring clock speed... \r" << flush;
300
301 vector<float> all_gflops;
302 for (int i = 0; i < 8; i++) {
303 benchmark_t b(1024, 1024, 1024);
304 b.run();
305 all_gflops.push_back(b.gflops);
306 }
307
308 sort(all_gflops.begin(), all_gflops.end());
309 float stable_estimate = all_gflops[2] + all_gflops[3] + all_gflops[4] + all_gflops[5];
310
311 // multiply by an arbitrary constant to discourage trying doing anything with the
312 // returned values besides just comparing them with each other.
313 float result = stable_estimate * 123.456f;
314
315 return result;
316 }
317
318 struct human_duration_t
319 {
320 int seconds;
human_duration_thuman_duration_t321 human_duration_t(int s) : seconds(s) {}
322 };
323
operator <<(ostream & s,const human_duration_t & d)324 ostream& operator<<(ostream& s, const human_duration_t& d)
325 {
326 int remainder = d.seconds;
327 if (remainder > 3600) {
328 int hours = remainder / 3600;
329 s << hours << " h ";
330 remainder -= hours * 3600;
331 }
332 if (remainder > 60) {
333 int minutes = remainder / 60;
334 s << minutes << " min ";
335 remainder -= minutes * 60;
336 }
337 if (d.seconds < 600) {
338 s << remainder << " s";
339 }
340 return s;
341 }
342
343 const char session_filename[] = "/data/local/tmp/benchmark-blocking-sizes-session.data";
344
serialize_benchmarks(const char * filename,const vector<benchmark_t> & benchmarks,size_t first_benchmark_to_run)345 void serialize_benchmarks(const char* filename, const vector<benchmark_t>& benchmarks, size_t first_benchmark_to_run)
346 {
347 FILE* file = fopen(filename, "w");
348 if (!file) {
349 cerr << "Could not open file " << filename << " for writing." << endl;
350 cerr << "Do you have write permissions on the current working directory?" << endl;
351 exit(1);
352 }
353 size_t benchmarks_vector_size = benchmarks.size();
354 fwrite(&max_clock_speed, sizeof(max_clock_speed), 1, file);
355 fwrite(&benchmarks_vector_size, sizeof(benchmarks_vector_size), 1, file);
356 fwrite(&first_benchmark_to_run, sizeof(first_benchmark_to_run), 1, file);
357 fwrite(benchmarks.data(), sizeof(benchmark_t), benchmarks.size(), file);
358 fclose(file);
359 }
360
deserialize_benchmarks(const char * filename,vector<benchmark_t> & benchmarks,size_t & first_benchmark_to_run)361 bool deserialize_benchmarks(const char* filename, vector<benchmark_t>& benchmarks, size_t& first_benchmark_to_run)
362 {
363 FILE* file = fopen(filename, "r");
364 if (!file) {
365 return false;
366 }
367 if (1 != fread(&max_clock_speed, sizeof(max_clock_speed), 1, file)) {
368 return false;
369 }
370 size_t benchmarks_vector_size = 0;
371 if (1 != fread(&benchmarks_vector_size, sizeof(benchmarks_vector_size), 1, file)) {
372 return false;
373 }
374 if (1 != fread(&first_benchmark_to_run, sizeof(first_benchmark_to_run), 1, file)) {
375 return false;
376 }
377 benchmarks.resize(benchmarks_vector_size);
378 if (benchmarks.size() != fread(benchmarks.data(), sizeof(benchmark_t), benchmarks.size(), file)) {
379 return false;
380 }
381 unlink(filename);
382 return true;
383 }
384
try_run_some_benchmarks(vector<benchmark_t> & benchmarks,double time_start,size_t & first_benchmark_to_run)385 void try_run_some_benchmarks(
386 vector<benchmark_t>& benchmarks,
387 double time_start,
388 size_t& first_benchmark_to_run)
389 {
390 if (first_benchmark_to_run == benchmarks.size()) {
391 return;
392 }
393
394 double time_last_progress_update = 0;
395 double time_last_clock_speed_measurement = 0;
396 double time_now = 0;
397
398 size_t benchmark_index = first_benchmark_to_run;
399
400 while (true) {
401 float ratio_done = float(benchmark_index) / benchmarks.size();
402 time_now = timer.getRealTime();
403
404 // We check clock speed every minute and at the end.
405 if (benchmark_index == benchmarks.size() ||
406 time_now > time_last_clock_speed_measurement + 60.0f)
407 {
408 time_last_clock_speed_measurement = time_now;
409
410 // Ensure that clock speed is as expected
411 float current_clock_speed = measure_clock_speed();
412
413 // The tolerance needs to be smaller than the relative difference between
414 // clock speeds that a device could operate under.
415 // It seems unlikely that a device would be throttling clock speeds by
416 // amounts smaller than 2%.
417 // With a value of 1%, I was getting within noise on a Sandy Bridge.
418 const float clock_speed_tolerance = 0.02f;
419
420 if (current_clock_speed > (1 + clock_speed_tolerance) * max_clock_speed) {
421 // Clock speed is now higher than we previously measured.
422 // Either our initial measurement was inaccurate, which won't happen
423 // too many times as we are keeping the best clock speed value and
424 // and allowing some tolerance; or something really weird happened,
425 // which invalidates all benchmark results collected so far.
426 // Either way, we better restart all over again now.
427 if (benchmark_index) {
428 cerr << "Restarting at " << 100.0f * ratio_done
429 << " % because clock speed increased. " << endl;
430 }
431 max_clock_speed = current_clock_speed;
432 first_benchmark_to_run = 0;
433 return;
434 }
435
436 bool rerun_last_tests = false;
437
438 if (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) {
439 cerr << "Measurements completed so far: "
440 << 100.0f * ratio_done
441 << " % " << endl;
442 cerr << "Clock speed seems to be only "
443 << current_clock_speed/max_clock_speed
444 << " times what it used to be." << endl;
445
446 unsigned int seconds_to_sleep_if_lower_clock_speed = 1;
447
448 while (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) {
449 if (seconds_to_sleep_if_lower_clock_speed > 32) {
450 cerr << "Sleeping longer probably won't make a difference." << endl;
451 cerr << "Serializing benchmarks to " << session_filename << endl;
452 serialize_benchmarks(session_filename, benchmarks, first_benchmark_to_run);
453 cerr << "Now restart this benchmark, and it should pick up where we left." << endl;
454 exit(2);
455 }
456 rerun_last_tests = true;
457 cerr << "Sleeping "
458 << seconds_to_sleep_if_lower_clock_speed
459 << " s... \r" << endl;
460 sleep(seconds_to_sleep_if_lower_clock_speed);
461 current_clock_speed = measure_clock_speed();
462 seconds_to_sleep_if_lower_clock_speed *= 2;
463 }
464 }
465
466 if (rerun_last_tests) {
467 cerr << "Redoing the last "
468 << 100.0f * float(benchmark_index - first_benchmark_to_run) / benchmarks.size()
469 << " % because clock speed had been low. " << endl;
470 return;
471 }
472
473 // nothing wrong with the clock speed so far, so there won't be a need to rerun
474 // benchmarks run so far in case we later encounter a lower clock speed.
475 first_benchmark_to_run = benchmark_index;
476 }
477
478 if (benchmark_index == benchmarks.size()) {
479 // We're done!
480 first_benchmark_to_run = benchmarks.size();
481 // Erase progress info
482 cerr << " " << endl;
483 return;
484 }
485
486 // Display progress info on stderr
487 if (time_now > time_last_progress_update + 1.0f) {
488 time_last_progress_update = time_now;
489 cerr << "Measurements... " << 100.0f * ratio_done
490 << " %, ETA "
491 << human_duration_t(float(time_now - time_start) * (1.0f - ratio_done) / ratio_done)
492 << " \r" << flush;
493 }
494
495 // This is where we actually run a benchmark!
496 benchmarks[benchmark_index].run();
497 benchmark_index++;
498 }
499 }
500
run_benchmarks(vector<benchmark_t> & benchmarks)501 void run_benchmarks(vector<benchmark_t>& benchmarks)
502 {
503 size_t first_benchmark_to_run;
504 vector<benchmark_t> deserialized_benchmarks;
505 bool use_deserialized_benchmarks = false;
506 if (deserialize_benchmarks(session_filename, deserialized_benchmarks, first_benchmark_to_run)) {
507 cerr << "Found serialized session with "
508 << 100.0f * first_benchmark_to_run / deserialized_benchmarks.size()
509 << " % already done" << endl;
510 if (deserialized_benchmarks.size() == benchmarks.size() &&
511 first_benchmark_to_run > 0 &&
512 first_benchmark_to_run < benchmarks.size())
513 {
514 use_deserialized_benchmarks = true;
515 }
516 }
517
518 if (use_deserialized_benchmarks) {
519 benchmarks = deserialized_benchmarks;
520 } else {
521 // not using deserialized benchmarks, starting from scratch
522 first_benchmark_to_run = 0;
523
524 // Randomly shuffling benchmarks allows us to get accurate enough progress info,
525 // as now the cheap/expensive benchmarks are randomly mixed so they average out.
526 // It also means that if data is corrupted for some time span, the odds are that
527 // not all repetitions of a given benchmark will be corrupted.
528 random_shuffle(benchmarks.begin(), benchmarks.end());
529 }
530
531 for (int i = 0; i < 4; i++) {
532 max_clock_speed = max(max_clock_speed, measure_clock_speed());
533 }
534
535 double time_start = 0.0;
536 while (first_benchmark_to_run < benchmarks.size()) {
537 if (first_benchmark_to_run == 0) {
538 time_start = timer.getRealTime();
539 }
540 try_run_some_benchmarks(benchmarks,
541 time_start,
542 first_benchmark_to_run);
543 }
544
545 // Sort timings by increasing benchmark parameters, and decreasing gflops.
546 // The latter is very important. It means that we can ignore all but the first
547 // benchmark with given parameters.
548 sort(benchmarks.begin(), benchmarks.end());
549
550 // Collect best (i.e. now first) results for each parameter values.
551 vector<benchmark_t> best_benchmarks;
552 for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
553 if (best_benchmarks.empty() ||
554 best_benchmarks.back().compact_product_size != it->compact_product_size ||
555 best_benchmarks.back().compact_block_size != it->compact_block_size)
556 {
557 best_benchmarks.push_back(*it);
558 }
559 }
560
561 // keep and return only the best benchmarks
562 benchmarks = best_benchmarks;
563 }
564
565 struct measure_all_pot_sizes_action_t : action_t
566 {
invokation_namemeasure_all_pot_sizes_action_t567 virtual const char* invokation_name() const { return "all-pot-sizes"; }
runmeasure_all_pot_sizes_action_t568 virtual void run() const
569 {
570 vector<benchmark_t> benchmarks;
571 for (int repetition = 0; repetition < measurement_repetitions; repetition++) {
572 for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) {
573 for (size_t msize = minsize; msize <= maxsize; msize *= 2) {
574 for (size_t nsize = minsize; nsize <= maxsize; nsize *= 2) {
575 for (size_t kblock = minsize; kblock <= ksize; kblock *= 2) {
576 for (size_t mblock = minsize; mblock <= msize; mblock *= 2) {
577 for (size_t nblock = minsize; nblock <= nsize; nblock *= 2) {
578 benchmarks.emplace_back(ksize, msize, nsize, kblock, mblock, nblock);
579 }
580 }
581 }
582 }
583 }
584 }
585 }
586
587 run_benchmarks(benchmarks);
588
589 cout << "BEGIN MEASUREMENTS ALL POT SIZES" << endl;
590 for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
591 cout << *it << endl;
592 }
593 }
594 };
595
596 struct measure_default_sizes_action_t : action_t
597 {
invokation_namemeasure_default_sizes_action_t598 virtual const char* invokation_name() const { return "default-sizes"; }
runmeasure_default_sizes_action_t599 virtual void run() const
600 {
601 vector<benchmark_t> benchmarks;
602 for (int repetition = 0; repetition < measurement_repetitions; repetition++) {
603 for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) {
604 for (size_t msize = minsize; msize <= maxsize; msize *= 2) {
605 for (size_t nsize = minsize; nsize <= maxsize; nsize *= 2) {
606 benchmarks.emplace_back(ksize, msize, nsize);
607 }
608 }
609 }
610 }
611
612 run_benchmarks(benchmarks);
613
614 cout << "BEGIN MEASUREMENTS DEFAULT SIZES" << endl;
615 for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
616 cout << *it << endl;
617 }
618 }
619 };
620
main(int argc,char * argv[])621 int main(int argc, char* argv[])
622 {
623 double time_start = timer.getRealTime();
624 cout.precision(4);
625 cerr.precision(4);
626
627 vector<unique_ptr<action_t>> available_actions;
628 available_actions.emplace_back(new measure_all_pot_sizes_action_t);
629 available_actions.emplace_back(new measure_default_sizes_action_t);
630
631 auto action = available_actions.end();
632
633 if (argc <= 1) {
634 show_usage_and_exit(argc, argv, available_actions);
635 }
636 for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
637 if (!strcmp(argv[1], (*it)->invokation_name())) {
638 action = it;
639 break;
640 }
641 }
642
643 if (action == available_actions.end()) {
644 show_usage_and_exit(argc, argv, available_actions);
645 }
646
647 for (int i = 2; i < argc; i++) {
648 if (argv[i] == strstr(argv[i], "--min-working-set-size=")) {
649 const char* equals_sign = strchr(argv[i], '=');
650 min_working_set_size = strtoul(equals_sign+1, nullptr, 10);
651 } else {
652 cerr << "unrecognized option: " << argv[i] << endl << endl;
653 show_usage_and_exit(argc, argv, available_actions);
654 }
655 }
656
657 print_cpuinfo();
658
659 cout << "benchmark parameters:" << endl;
660 cout << "pointer size: " << 8*sizeof(void*) << " bits" << endl;
661 cout << "scalar type: " << type_name<Scalar>() << endl;
662 cout << "packet size: " << internal::packet_traits<MatrixType::Scalar>::size << endl;
663 cout << "minsize = " << minsize << endl;
664 cout << "maxsize = " << maxsize << endl;
665 cout << "measurement_repetitions = " << measurement_repetitions << endl;
666 cout << "min_accurate_time = " << min_accurate_time << endl;
667 cout << "min_working_set_size = " << min_working_set_size;
668 if (min_working_set_size == 0) {
669 cout << " (try to outsize caches)";
670 }
671 cout << endl << endl;
672
673 (*action)->run();
674
675 double time_end = timer.getRealTime();
676 cerr << "Finished in " << human_duration_t(time_end - time_start) << endl;
677 }
678