1 /*
2 * Microbenchmark for math functions.
3 *
4 * Copyright (c) 2018-2022, Arm Limited.
5 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6 */
7
8 #undef _GNU_SOURCE
9 #define _GNU_SOURCE 1
10 #include <stdint.h>
11 #include <stdlib.h>
12 #include <stdio.h>
13 #include <string.h>
14 #include <time.h>
15 #include <math.h>
16 #include "mathlib.h"
17
18 #ifndef WANT_VMATH
19 /* Enable the build of vector math code. */
20 # define WANT_VMATH 1
21 #endif
22
23 /* Number of measurements, best result is reported. */
24 #define MEASURE 60
25 /* Array size. */
26 #define N 8000
27 /* Iterations over the array. */
28 #define ITER 125
29
30 static double *Trace;
31 static size_t trace_size;
32 static double A[N];
33 static float Af[N];
34 static long measurecount = MEASURE;
35 static long itercount = ITER;
36
37 #if __aarch64__ && WANT_VMATH
38 typedef __f64x2_t v_double;
39
40 #define v_double_len() 2
41
42 static inline v_double
v_double_load(const double * p)43 v_double_load (const double *p)
44 {
45 return (v_double){p[0], p[1]};
46 }
47
48 static inline v_double
v_double_dup(double x)49 v_double_dup (double x)
50 {
51 return (v_double){x, x};
52 }
53
54 typedef __f32x4_t v_float;
55
56 #define v_float_len() 4
57
58 static inline v_float
v_float_load(const float * p)59 v_float_load (const float *p)
60 {
61 return (v_float){p[0], p[1], p[2], p[3]};
62 }
63
64 static inline v_float
v_float_dup(float x)65 v_float_dup (float x)
66 {
67 return (v_float){x, x, x, x};
68 }
69 #if WANT_SVE_MATH
70 #include <arm_sve.h>
71 typedef svbool_t sv_bool;
72 typedef svfloat64_t sv_double;
73
74 #define sv_double_len() svcntd()
75
76 static inline sv_double
sv_double_load(const double * p)77 sv_double_load (const double *p)
78 {
79 svbool_t pg = svptrue_b64();
80 return svld1(pg, p);
81 }
82
83 static inline sv_double
sv_double_dup(double x)84 sv_double_dup (double x)
85 {
86 return svdup_n_f64(x);
87 }
88
89 typedef svfloat32_t sv_float;
90
91 #define sv_float_len() svcntw()
92
93 static inline sv_float
sv_float_load(const float * p)94 sv_float_load (const float *p)
95 {
96 svbool_t pg = svptrue_b32();
97 return svld1(pg, p);
98 }
99
100 static inline sv_float
sv_float_dup(float x)101 sv_float_dup (float x)
102 {
103 return svdup_n_f32(x);
104 }
105 #endif
106 #else
107 /* dummy definitions to make things compile. */
108 typedef double v_double;
109 typedef float v_float;
110 #define v_double_len(x) 1
111 #define v_double_load(x) (x)[0]
112 #define v_double_dup(x) (x)
113 #define v_float_len(x) 1
114 #define v_float_load(x) (x)[0]
115 #define v_float_dup(x) (x)
116 #endif
117
118 static double
dummy(double x)119 dummy (double x)
120 {
121 return x;
122 }
123
124 static float
dummyf(float x)125 dummyf (float x)
126 {
127 return x;
128 }
129 #if WANT_VMATH
130 #if __aarch64__
131 static v_double
__v_dummy(v_double x)132 __v_dummy (v_double x)
133 {
134 return x;
135 }
136
137 static v_float
__v_dummyf(v_float x)138 __v_dummyf (v_float x)
139 {
140 return x;
141 }
142
143 #ifdef __vpcs
144 __vpcs static v_double
__vn_dummy(v_double x)145 __vn_dummy (v_double x)
146 {
147 return x;
148 }
149
150 __vpcs static v_float
__vn_dummyf(v_float x)151 __vn_dummyf (v_float x)
152 {
153 return x;
154 }
155 #endif
156 #if WANT_SVE_MATH
157 static sv_double
__sv_dummy(sv_double x,sv_bool pg)158 __sv_dummy (sv_double x, sv_bool pg)
159 {
160 return x;
161 }
162
163 static sv_float
__sv_dummyf(sv_float x,sv_bool pg)164 __sv_dummyf (sv_float x, sv_bool pg)
165 {
166 return x;
167 }
168
169 #endif
170 #endif
171 #endif
172
173 #include "test/mathbench_wrappers.h"
174
175 static const struct fun
176 {
177 const char *name;
178 int prec;
179 int vec;
180 double lo;
181 double hi;
182 union
183 {
184 double (*d) (double);
185 float (*f) (float);
186 v_double (*vd) (v_double);
187 v_float (*vf) (v_float);
188 #ifdef __vpcs
189 __vpcs v_double (*vnd) (v_double);
190 __vpcs v_float (*vnf) (v_float);
191 #endif
192 #if WANT_SVE_MATH
193 sv_double (*svd) (sv_double, sv_bool);
194 sv_float (*svf) (sv_float, sv_bool);
195 #endif
196 } fun;
197 } funtab[] = {
198 #define D(func, lo, hi) {#func, 'd', 0, lo, hi, {.d = func}},
199 #define F(func, lo, hi) {#func, 'f', 0, lo, hi, {.f = func}},
200 #define VD(func, lo, hi) {#func, 'd', 'v', lo, hi, {.vd = func}},
201 #define VF(func, lo, hi) {#func, 'f', 'v', lo, hi, {.vf = func}},
202 #define VND(func, lo, hi) {#func, 'd', 'n', lo, hi, {.vnd = func}},
203 #define VNF(func, lo, hi) {#func, 'f', 'n', lo, hi, {.vnf = func}},
204 #define SVD(func, lo, hi) {#func, 'd', 's', lo, hi, {.svd = func}},
205 #define SVF(func, lo, hi) {#func, 'f', 's', lo, hi, {.svf = func}},
206 D (dummy, 1.0, 2.0)
207 F (dummyf, 1.0, 2.0)
208 #if WANT_VMATH
209 #if __aarch64__
210 VD (__v_dummy, 1.0, 2.0)
211 VF (__v_dummyf, 1.0, 2.0)
212 #ifdef __vpcs
213 VND (__vn_dummy, 1.0, 2.0)
214 VNF (__vn_dummyf, 1.0, 2.0)
215 #endif
216 #if WANT_SVE_MATH
217 SVD (__sv_dummy, 1.0, 2.0)
218 SVF (__sv_dummyf, 1.0, 2.0)
219 #endif
220 #endif
221 #endif
222 #include "test/mathbench_funcs.h"
223 {0},
224 #undef F
225 #undef D
226 #undef VF
227 #undef VD
228 #undef VNF
229 #undef VND
230 #undef SVF
231 #undef SVD
232 };
233
234 static void
gen_linear(double lo,double hi)235 gen_linear (double lo, double hi)
236 {
237 for (int i = 0; i < N; i++)
238 A[i] = (lo * (N - i) + hi * i) / N;
239 }
240
241 static void
genf_linear(double lo,double hi)242 genf_linear (double lo, double hi)
243 {
244 for (int i = 0; i < N; i++)
245 Af[i] = (float)(lo * (N - i) + hi * i) / N;
246 }
247
248 static inline double
asdouble(uint64_t i)249 asdouble (uint64_t i)
250 {
251 union
252 {
253 uint64_t i;
254 double f;
255 } u = {i};
256 return u.f;
257 }
258
259 static uint64_t seed = 0x0123456789abcdef;
260
261 static double
frand(double lo,double hi)262 frand (double lo, double hi)
263 {
264 seed = 6364136223846793005ULL * seed + 1;
265 return lo + (hi - lo) * (asdouble (seed >> 12 | 0x3ffULL << 52) - 1.0);
266 }
267
268 static void
gen_rand(double lo,double hi)269 gen_rand (double lo, double hi)
270 {
271 for (int i = 0; i < N; i++)
272 A[i] = frand (lo, hi);
273 }
274
275 static void
genf_rand(double lo,double hi)276 genf_rand (double lo, double hi)
277 {
278 for (int i = 0; i < N; i++)
279 Af[i] = (float)frand (lo, hi);
280 }
281
282 static void
gen_trace(int index)283 gen_trace (int index)
284 {
285 for (int i = 0; i < N; i++)
286 A[i] = Trace[index + i];
287 }
288
289 static void
genf_trace(int index)290 genf_trace (int index)
291 {
292 for (int i = 0; i < N; i++)
293 Af[i] = (float)Trace[index + i];
294 }
295
296 static void
run_thruput(double f (double))297 run_thruput (double f (double))
298 {
299 for (int i = 0; i < N; i++)
300 f (A[i]);
301 }
302
303 static void
runf_thruput(float f (float))304 runf_thruput (float f (float))
305 {
306 for (int i = 0; i < N; i++)
307 f (Af[i]);
308 }
309
310 volatile double zero = 0;
311
312 static void
run_latency(double f (double))313 run_latency (double f (double))
314 {
315 double z = zero;
316 double prev = z;
317 for (int i = 0; i < N; i++)
318 prev = f (A[i] + prev * z);
319 }
320
321 static void
runf_latency(float f (float))322 runf_latency (float f (float))
323 {
324 float z = (float)zero;
325 float prev = z;
326 for (int i = 0; i < N; i++)
327 prev = f (Af[i] + prev * z);
328 }
329
330 static void
run_v_thruput(v_double f (v_double))331 run_v_thruput (v_double f (v_double))
332 {
333 for (int i = 0; i < N; i += v_double_len ())
334 f (v_double_load (A+i));
335 }
336
337 static void
runf_v_thruput(v_float f (v_float))338 runf_v_thruput (v_float f (v_float))
339 {
340 for (int i = 0; i < N; i += v_float_len ())
341 f (v_float_load (Af+i));
342 }
343
344 static void
run_v_latency(v_double f (v_double))345 run_v_latency (v_double f (v_double))
346 {
347 v_double z = v_double_dup (zero);
348 v_double prev = z;
349 for (int i = 0; i < N; i += v_double_len ())
350 prev = f (v_double_load (A+i) + prev * z);
351 }
352
353 static void
runf_v_latency(v_float f (v_float))354 runf_v_latency (v_float f (v_float))
355 {
356 v_float z = v_float_dup (zero);
357 v_float prev = z;
358 for (int i = 0; i < N; i += v_float_len ())
359 prev = f (v_float_load (Af+i) + prev * z);
360 }
361
362 #ifdef __vpcs
363 static void
run_vn_thruput(__vpcs v_double f (v_double))364 run_vn_thruput (__vpcs v_double f (v_double))
365 {
366 for (int i = 0; i < N; i += v_double_len ())
367 f (v_double_load (A+i));
368 }
369
370 static void
runf_vn_thruput(__vpcs v_float f (v_float))371 runf_vn_thruput (__vpcs v_float f (v_float))
372 {
373 for (int i = 0; i < N; i += v_float_len ())
374 f (v_float_load (Af+i));
375 }
376
377 static void
run_vn_latency(__vpcs v_double f (v_double))378 run_vn_latency (__vpcs v_double f (v_double))
379 {
380 v_double z = v_double_dup (zero);
381 v_double prev = z;
382 for (int i = 0; i < N; i += v_double_len ())
383 prev = f (v_double_load (A+i) + prev * z);
384 }
385
386 static void
runf_vn_latency(__vpcs v_float f (v_float))387 runf_vn_latency (__vpcs v_float f (v_float))
388 {
389 v_float z = v_float_dup (zero);
390 v_float prev = z;
391 for (int i = 0; i < N; i += v_float_len ())
392 prev = f (v_float_load (Af+i) + prev * z);
393 }
394 #endif
395
396 #if WANT_SVE_MATH
397 static void
run_sv_thruput(sv_double f (sv_double,sv_bool))398 run_sv_thruput (sv_double f (sv_double, sv_bool))
399 {
400 for (int i = 0; i < N; i += sv_double_len ())
401 f (sv_double_load (A+i), svptrue_b64 ());
402 }
403
404 static void
runf_sv_thruput(sv_float f (sv_float,sv_bool))405 runf_sv_thruput (sv_float f (sv_float, sv_bool))
406 {
407 for (int i = 0; i < N; i += sv_float_len ())
408 f (sv_float_load (Af+i), svptrue_b32 ());
409 }
410
411 static void
run_sv_latency(sv_double f (sv_double,sv_bool))412 run_sv_latency (sv_double f (sv_double, sv_bool))
413 {
414 sv_double z = sv_double_dup (zero);
415 sv_double prev = z;
416 for (int i = 0; i < N; i += sv_double_len ())
417 prev = f (svmad_f64_x (svptrue_b64 (), prev, z, sv_double_load (A+i)), svptrue_b64 ());
418 }
419
420 static void
runf_sv_latency(sv_float f (sv_float,sv_bool))421 runf_sv_latency (sv_float f (sv_float, sv_bool))
422 {
423 sv_float z = sv_float_dup (zero);
424 sv_float prev = z;
425 for (int i = 0; i < N; i += sv_float_len ())
426 prev = f (svmad_f32_x (svptrue_b32 (), prev, z, sv_float_load (Af+i)), svptrue_b32 ());
427 }
428 #endif
429
430 static uint64_t
tic(void)431 tic (void)
432 {
433 struct timespec ts;
434 if (clock_gettime (CLOCK_REALTIME, &ts))
435 abort ();
436 return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
437 }
438
439 #define TIMEIT(run, f) do { \
440 dt = -1; \
441 run (f); /* Warm up. */ \
442 for (int j = 0; j < measurecount; j++) \
443 { \
444 uint64_t t0 = tic (); \
445 for (int i = 0; i < itercount; i++) \
446 run (f); \
447 uint64_t t1 = tic (); \
448 if (t1 - t0 < dt) \
449 dt = t1 - t0; \
450 } \
451 } while (0)
452
453 static void
bench1(const struct fun * f,int type,double lo,double hi)454 bench1 (const struct fun *f, int type, double lo, double hi)
455 {
456 uint64_t dt = 0;
457 uint64_t ns100;
458 const char *s = type == 't' ? "rthruput" : "latency";
459 int vlen = 1;
460
461 if (f->vec && f->prec == 'd')
462 vlen = v_double_len();
463 else if (f->vec && f->prec == 'f')
464 vlen = v_float_len();
465
466 if (f->prec == 'd' && type == 't' && f->vec == 0)
467 TIMEIT (run_thruput, f->fun.d);
468 else if (f->prec == 'd' && type == 'l' && f->vec == 0)
469 TIMEIT (run_latency, f->fun.d);
470 else if (f->prec == 'f' && type == 't' && f->vec == 0)
471 TIMEIT (runf_thruput, f->fun.f);
472 else if (f->prec == 'f' && type == 'l' && f->vec == 0)
473 TIMEIT (runf_latency, f->fun.f);
474 else if (f->prec == 'd' && type == 't' && f->vec == 'v')
475 TIMEIT (run_v_thruput, f->fun.vd);
476 else if (f->prec == 'd' && type == 'l' && f->vec == 'v')
477 TIMEIT (run_v_latency, f->fun.vd);
478 else if (f->prec == 'f' && type == 't' && f->vec == 'v')
479 TIMEIT (runf_v_thruput, f->fun.vf);
480 else if (f->prec == 'f' && type == 'l' && f->vec == 'v')
481 TIMEIT (runf_v_latency, f->fun.vf);
482 #ifdef __vpcs
483 else if (f->prec == 'd' && type == 't' && f->vec == 'n')
484 TIMEIT (run_vn_thruput, f->fun.vnd);
485 else if (f->prec == 'd' && type == 'l' && f->vec == 'n')
486 TIMEIT (run_vn_latency, f->fun.vnd);
487 else if (f->prec == 'f' && type == 't' && f->vec == 'n')
488 TIMEIT (runf_vn_thruput, f->fun.vnf);
489 else if (f->prec == 'f' && type == 'l' && f->vec == 'n')
490 TIMEIT (runf_vn_latency, f->fun.vnf);
491 #endif
492 #if WANT_SVE_MATH
493 else if (f->prec == 'd' && type == 't' && f->vec == 's')
494 TIMEIT (run_sv_thruput, f->fun.svd);
495 else if (f->prec == 'd' && type == 'l' && f->vec == 's')
496 TIMEIT (run_sv_latency, f->fun.svd);
497 else if (f->prec == 'f' && type == 't' && f->vec == 's')
498 TIMEIT (runf_sv_thruput, f->fun.svf);
499 else if (f->prec == 'f' && type == 'l' && f->vec == 's')
500 TIMEIT (runf_sv_latency, f->fun.svf);
501 #endif
502
503 if (type == 't')
504 {
505 ns100 = (100 * dt + itercount * N / 2) / (itercount * N);
506 printf ("%9s %8s: %4u.%02u ns/elem %10llu ns in [%g %g]\n", f->name, s,
507 (unsigned) (ns100 / 100), (unsigned) (ns100 % 100),
508 (unsigned long long) dt, lo, hi);
509 }
510 else if (type == 'l')
511 {
512 ns100 = (100 * dt + itercount * N / vlen / 2) / (itercount * N / vlen);
513 printf ("%9s %8s: %4u.%02u ns/call %10llu ns in [%g %g]\n", f->name, s,
514 (unsigned) (ns100 / 100), (unsigned) (ns100 % 100),
515 (unsigned long long) dt, lo, hi);
516 }
517 fflush (stdout);
518 }
519
520 static void
bench(const struct fun * f,double lo,double hi,int type,int gen)521 bench (const struct fun *f, double lo, double hi, int type, int gen)
522 {
523 if (f->prec == 'd' && gen == 'r')
524 gen_rand (lo, hi);
525 else if (f->prec == 'd' && gen == 'l')
526 gen_linear (lo, hi);
527 else if (f->prec == 'd' && gen == 't')
528 gen_trace (0);
529 else if (f->prec == 'f' && gen == 'r')
530 genf_rand (lo, hi);
531 else if (f->prec == 'f' && gen == 'l')
532 genf_linear (lo, hi);
533 else if (f->prec == 'f' && gen == 't')
534 genf_trace (0);
535
536 if (gen == 't')
537 hi = trace_size / N;
538
539 if (type == 'b' || type == 't')
540 bench1 (f, 't', lo, hi);
541
542 if (type == 'b' || type == 'l')
543 bench1 (f, 'l', lo, hi);
544
545 for (int i = N; i < trace_size; i += N)
546 {
547 if (f->prec == 'd')
548 gen_trace (i);
549 else
550 genf_trace (i);
551
552 lo = i / N;
553 if (type == 'b' || type == 't')
554 bench1 (f, 't', lo, hi);
555
556 if (type == 'b' || type == 'l')
557 bench1 (f, 'l', lo, hi);
558 }
559 }
560
561 static void
readtrace(const char * name)562 readtrace (const char *name)
563 {
564 int n = 0;
565 FILE *f = strcmp (name, "-") == 0 ? stdin : fopen (name, "r");
566 if (!f)
567 {
568 printf ("openning \"%s\" failed: %m\n", name);
569 exit (1);
570 }
571 for (;;)
572 {
573 if (n >= trace_size)
574 {
575 trace_size += N;
576 Trace = realloc (Trace, trace_size * sizeof (Trace[0]));
577 if (Trace == NULL)
578 {
579 printf ("out of memory\n");
580 exit (1);
581 }
582 }
583 if (fscanf (f, "%lf", Trace + n) != 1)
584 break;
585 n++;
586 }
587 if (ferror (f) || n == 0)
588 {
589 printf ("reading \"%s\" failed: %m\n", name);
590 exit (1);
591 }
592 fclose (f);
593 if (n % N == 0)
594 trace_size = n;
595 for (int i = 0; n < trace_size; n++, i++)
596 Trace[n] = Trace[i];
597 }
598
599 static void
usage(void)600 usage (void)
601 {
602 printf ("usage: ./mathbench [-g rand|linear|trace] [-t latency|thruput|both] "
603 "[-i low high] [-f tracefile] [-m measurements] [-c iterations] func "
604 "[func2 ..]\n");
605 printf ("func:\n");
606 printf ("%7s [run all benchmarks]\n", "all");
607 for (const struct fun *f = funtab; f->name; f++)
608 printf ("%7s [low: %g high: %g]\n", f->name, f->lo, f->hi);
609 exit (1);
610 }
611
612 int
main(int argc,char * argv[])613 main (int argc, char *argv[])
614 {
615 int usergen = 0, gen = 'r', type = 'b', all = 0;
616 double lo = 0, hi = 0;
617 const char *tracefile = "-";
618
619 argv++;
620 argc--;
621 for (;;)
622 {
623 if (argc <= 0)
624 usage ();
625 if (argv[0][0] != '-')
626 break;
627 else if (argc >= 3 && strcmp (argv[0], "-i") == 0)
628 {
629 usergen = 1;
630 lo = strtod (argv[1], 0);
631 hi = strtod (argv[2], 0);
632 argv += 3;
633 argc -= 3;
634 }
635 else if (argc >= 2 && strcmp (argv[0], "-m") == 0)
636 {
637 measurecount = strtol (argv[1], 0, 0);
638 argv += 2;
639 argc -= 2;
640 }
641 else if (argc >= 2 && strcmp (argv[0], "-c") == 0)
642 {
643 itercount = strtol (argv[1], 0, 0);
644 argv += 2;
645 argc -= 2;
646 }
647 else if (argc >= 2 && strcmp (argv[0], "-g") == 0)
648 {
649 gen = argv[1][0];
650 if (strchr ("rlt", gen) == 0)
651 usage ();
652 argv += 2;
653 argc -= 2;
654 }
655 else if (argc >= 2 && strcmp (argv[0], "-f") == 0)
656 {
657 gen = 't'; /* -f implies -g trace. */
658 tracefile = argv[1];
659 argv += 2;
660 argc -= 2;
661 }
662 else if (argc >= 2 && strcmp (argv[0], "-t") == 0)
663 {
664 type = argv[1][0];
665 if (strchr ("ltb", type) == 0)
666 usage ();
667 argv += 2;
668 argc -= 2;
669 }
670 else
671 usage ();
672 }
673 if (gen == 't')
674 {
675 readtrace (tracefile);
676 lo = hi = 0;
677 usergen = 1;
678 }
679 while (argc > 0)
680 {
681 int found = 0;
682 all = strcmp (argv[0], "all") == 0;
683 for (const struct fun *f = funtab; f->name; f++)
684 if (all || strcmp (argv[0], f->name) == 0)
685 {
686 found = 1;
687 if (!usergen)
688 {
689 lo = f->lo;
690 hi = f->hi;
691 }
692 bench (f, lo, hi, type, gen);
693 if (usergen && !all)
694 break;
695 }
696 if (!found)
697 printf ("unknown function: %s\n", argv[0]);
698 argv++;
699 argc--;
700 }
701 return 0;
702 }
703