1 /*
2 * Copyright © 2018, VideoLAN and dav1d authors
3 * Copyright © 2018, Two Orioles, LLC
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright notice, this
10 * list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright notice,
13 * this list of conditions and the following disclaimer in the documentation
14 * and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27 #include "tests/checkasm/checkasm.h"
28
29 #include <errno.h>
30 #include <math.h>
31 #include <signal.h>
32 #include <stdarg.h>
33 #include <stdio.h>
34 #include <string.h>
35
36 #include "src/cpu.h"
37
38 #ifdef _WIN32
39 #ifndef SIGBUS
40 /* non-standard, use the same value as mingw-w64 */
41 #define SIGBUS 10
42 #endif
43 #ifndef ENABLE_VIRTUAL_TERMINAL_PROCESSING
44 #define ENABLE_VIRTUAL_TERMINAL_PROCESSING 0x04
45 #endif
46 #else
47 #include <unistd.h>
48 #include <time.h>
49 #include <pthread.h>
50 #ifdef HAVE_PTHREAD_NP_H
51 #include <pthread_np.h>
52 #endif
53 #ifdef __APPLE__
54 #include <mach/mach_time.h>
55 #endif
56 #endif
57 #if CONFIG_MACOS_KPERF
58 #include <dlfcn.h>
59 #endif
60
/* Color codes passed to color_fprintf(), which inserts them into an
 * "\x1b[0;<code>m" escape sequence when colored output is enabled */
#define COLOR_RED 31
#define COLOR_GREEN 32
#define COLOR_YELLOW 33
64
/* List of tests to invoke.
 * `name` is what --list-tests prints and what the --test/-t pattern is
 * matched against; `func` is the entry point called by check_cpu_flag().
 * The table ends with an all-zero sentinel entry. */
static const struct {
    const char *name;
    void (*func)(void);
} tests[] = {
    { "msac", checkasm_check_msac },
    { "pal", checkasm_check_pal },
    { "refmvs", checkasm_check_refmvs },
#if CONFIG_8BPC
    { "cdef_8bpc", checkasm_check_cdef_8bpc },
    { "filmgrain_8bpc", checkasm_check_filmgrain_8bpc },
    { "ipred_8bpc", checkasm_check_ipred_8bpc },
    { "itx_8bpc", checkasm_check_itx_8bpc },
    { "loopfilter_8bpc", checkasm_check_loopfilter_8bpc },
    { "looprestoration_8bpc", checkasm_check_looprestoration_8bpc },
    { "mc_8bpc", checkasm_check_mc_8bpc },
#endif
#if CONFIG_16BPC
    { "cdef_16bpc", checkasm_check_cdef_16bpc },
    { "filmgrain_16bpc", checkasm_check_filmgrain_16bpc },
    { "ipred_16bpc", checkasm_check_ipred_16bpc },
    { "itx_16bpc", checkasm_check_itx_16bpc },
    { "loopfilter_16bpc", checkasm_check_loopfilter_16bpc },
    { "looprestoration_16bpc", checkasm_check_looprestoration_16bpc },
    { "mc_16bpc", checkasm_check_mc_16bpc },
#endif
    { 0 }
};
93
/* List of cpu flags to check.
 * `name` is the heading printed for the flag, `suffix` is the text that
 * cpu_suffix() appends to function names and that --list-cpuflags prints,
 * and `flag` is the bit passed to check_cpu_flag(). Only the entries of
 * the architecture being built are present; the table ends with an
 * all-zero sentinel entry. */
static const struct {
    const char *name;
    const char *suffix;
    unsigned flag;
} cpus[] = {
#if ARCH_X86
    { "SSE2", "sse2", DAV1D_X86_CPU_FLAG_SSE2 },
    { "SSSE3", "ssse3", DAV1D_X86_CPU_FLAG_SSSE3 },
    { "SSE4.1", "sse4", DAV1D_X86_CPU_FLAG_SSE41 },
    { "AVX2", "avx2", DAV1D_X86_CPU_FLAG_AVX2 },
    { "AVX-512 (Ice Lake)", "avx512icl", DAV1D_X86_CPU_FLAG_AVX512ICL },
#elif ARCH_AARCH64 || ARCH_ARM
    { "NEON", "neon", DAV1D_ARM_CPU_FLAG_NEON },
    { "DOTPROD", "dotprod", DAV1D_ARM_CPU_FLAG_DOTPROD },
    { "I8MM", "i8mm", DAV1D_ARM_CPU_FLAG_I8MM },
#if ARCH_AARCH64
    { "SVE", "sve", DAV1D_ARM_CPU_FLAG_SVE },
    { "SVE2", "sve2", DAV1D_ARM_CPU_FLAG_SVE2 },
#endif /* ARCH_AARCH64 */
#elif ARCH_LOONGARCH
    { "LSX", "lsx", DAV1D_LOONGARCH_CPU_FLAG_LSX },
    { "LASX", "lasx", DAV1D_LOONGARCH_CPU_FLAG_LASX },
#elif ARCH_PPC64LE
    { "VSX", "vsx", DAV1D_PPC_CPU_FLAG_VSX },
    { "PWR9", "pwr9", DAV1D_PPC_CPU_FLAG_PWR9 },
#elif ARCH_RISCV
    { "RVV", "rvv", DAV1D_RISCV_CPU_FLAG_V },
#endif
    { 0 }
};
125
/* Vector length queries used by main() when printing the startup banner.
 * They are not defined in this part of the file. main() prints the SVE
 * result as a number of bits and multiplies the RISC-V result by 8
 * before printing it as bits. */
#if ARCH_AARCH64 && HAVE_SVE
int checkasm_sve_length(void);
#elif ARCH_RISCV
int checkasm_get_vlenb(void);
#endif
131
/* One implementation of a function, as registered by checkasm_check_func().
 * The versions of a function form a singly linked list. */
typedef struct CheckasmFuncVersion {
    struct CheckasmFuncVersion *next; /* next registered version, or NULL */
    void *func; /* address of the implementation */
    int ok; /* set to 1 on registration, cleared by checkasm_fail_func() */
    unsigned cpu; /* value of state.cpu_flag at registration time */
    int iterations; /* accumulated by checkasm_update_bench() */
    uint64_t cycles; /* accumulated by checkasm_update_bench() */
} CheckasmFuncVersion;
140
/* Binary search tree node, ordered by cmp_func_names() and kept balanced
 * by balance_tree() */
typedef struct CheckasmFunc {
    struct CheckasmFunc *child[2]; /* [0] = names comparing lower, [1] = higher */
    CheckasmFuncVersion versions; /* first version, embedded in the node */
    uint8_t color; /* 0 = red, 1 = black */
    char name[]; /* NUL-terminated function name, allocated with the node */
} CheckasmFunc;
148
/* What main() has been asked to do. The order matters: main() treats
 * every value >= RUN_CPUFLAG_LISTING as a listing mode when it decides
 * whether stdout or stderr is checked for color support. */
typedef enum {
    RUN_NORMAL = 0, /* run the tests (default) */
    RUN_BENCHMARK, /* --bench/-b */
    RUN_CPUFLAG_LISTING, /* --list-cpuflags */
    RUN_FUNCTION_LISTING, /* --list-functions */
} CheckasmRunMode;
155
/* Internal state */
static struct {
    CheckasmFunc *funcs; /* root of the tree of registered functions */
    CheckasmFunc *current_func; /* node set by checkasm_check_func() */
    CheckasmFuncVersion *current_func_ver; /* version set by checkasm_check_func() */
    const char *current_test_name; /* name of the tests[] entry being run */
    int num_checked; /* versions registered while a cpu flag was active */
    int num_failed; /* versions marked as failed */
    double nop_time; /* result of measure_nop_time() */
    unsigned cpu_flag; /* cpu flags currently in effect */
    const char *cpu_flag_name; /* heading still to be printed, or NULL */
    const char *test_pattern; /* --test/-t pattern, NULL = no filtering */
    const char *function_pattern; /* --function/-f pattern, NULL = no filtering */
    unsigned seed; /* value the rng is re-seeded with */
    CheckasmRunMode run_mode;
    int verbose; /* set by --verbose/-v */
    volatile sig_atomic_t sig; // SIG_ATOMIC_MAX = signal handling enabled
    int suffix_length; /* length of the current cpu suffix plus one */
    int max_function_name_length; /* longest name plus suffix_length seen so far */
#if ARCH_X86_64
    void (*simd_warmup)(void); /* chosen by main() from the AVX2/AVX-512 flags */
#endif
} state;
179
/* float compare support code */
/* Gives access to the bit pattern of a float as a 32-bit integer */
typedef union {
    float f;
    uint32_t i;
} intfloat;
185
/* The four state words of the xor128 generator */
static uint32_t xs_state[4];

/* Seed the generator. The four state words are the seed with none, the
 * lower half, the upper half, and all of its bits inverted. */
static void xor128_srand(unsigned seed) {
    const unsigned upper = 0xffff0000;
    const unsigned lower = 0x0000ffff;
    const unsigned inverted = ~seed;

    xs_state[3] = inverted;
    xs_state[2] = (inverted & upper) | (seed & lower);
    xs_state[1] = (seed & upper) | (inverted & lower);
    xs_state[0] = seed;
}
194
195 // xor128 from Marsaglia, George (July 2003). "Xorshift RNGs".
196 // Journal of Statistical Software. 8 (14).
197 // doi:10.18637/jss.v008.i14.
xor128_rand(void)198 int xor128_rand(void) {
199 const uint32_t x = xs_state[0];
200 const uint32_t t = x ^ (x << 11);
201
202 xs_state[0] = xs_state[1];
203 xs_state[1] = xs_state[2];
204 xs_state[2] = xs_state[3];
205 uint32_t w = xs_state[3];
206
207 w = (w ^ (w >> 19)) ^ (t ^ (t >> 8));
208 xs_state[3] = w;
209
210 return w >> 1;
211 }
212
213 #if CONFIG_MACOS_KPERF
214
/* Resolved from the kperf framework by kperf_init() and called by
 * checkasm_kperf_cycles() */
static int (*kpc_get_thread_counters)(int, unsigned int, void *);

/* OR'ed with CPMU_CORE_CYCLE to form the first configuration word */
#define CFGWORD_EL0A64EN_MASK (0x20000)

#define CPMU_CORE_CYCLE 0x02

/* Counter class bits, combined into KPC_MASK below */
#define KPC_CLASS_FIXED_MASK (1 << 0)
#define KPC_CLASS_CONFIGURABLE_MASK (1 << 1)

/* Counts kperf_init() requires the framework to report for KPC_MASK */
#define COUNTERS_COUNT 10
#define CONFIG_COUNT 8
#define KPC_MASK (KPC_CLASS_CONFIGURABLE_MASK | KPC_CLASS_FIXED_MASK)
227
kperf_init(void)228 static int kperf_init(void) {
229 uint64_t config[COUNTERS_COUNT] = { 0 };
230
231 void *kperf = dlopen("/System/Library/PrivateFrameworks/kperf.framework/kperf", RTLD_LAZY);
232 if (!kperf) {
233 fprintf(stderr, "checkasm: Unable to load kperf: %s\n", dlerror());
234 return 1;
235 }
236
237 int (*kpc_force_all_ctrs_set)(int) = dlsym(kperf, "kpc_force_all_ctrs_set");
238 int (*kpc_set_counting)(uint32_t) = dlsym(kperf, "kpc_set_counting");
239 int (*kpc_set_thread_counting)(uint32_t) = dlsym(kperf, "kpc_set_thread_counting");
240 int (*kpc_set_config)(uint32_t, void *) = dlsym(kperf, "kpc_set_config");
241 uint32_t (*kpc_get_counter_count)(uint32_t) = dlsym(kperf, "kpc_get_counter_count");
242 uint32_t (*kpc_get_config_count)(uint32_t) = dlsym(kperf, "kpc_get_config_count");
243 kpc_get_thread_counters = dlsym(kperf, "kpc_get_thread_counters");
244
245 if (!kpc_get_thread_counters) {
246 fprintf(stderr, "checkasm: Unable to load kpc_get_thread_counters\n");
247 return 1;
248 }
249
250 if (!kpc_get_counter_count || kpc_get_counter_count(KPC_MASK) != COUNTERS_COUNT) {
251 fprintf(stderr, "checkasm: Unxpected kpc_get_counter_count\n");
252 return 1;
253 }
254 if (!kpc_get_config_count || kpc_get_config_count(KPC_MASK) != CONFIG_COUNT) {
255 fprintf(stderr, "checkasm: Unxpected kpc_get_config_count\n");
256 return 1;
257 }
258
259 config[0] = CPMU_CORE_CYCLE | CFGWORD_EL0A64EN_MASK;
260
261 if (!kpc_set_config || kpc_set_config(KPC_MASK, config)) {
262 fprintf(stderr, "checkasm: The kperf API needs to be run as root\n");
263 return 1;
264 }
265 if (!kpc_force_all_ctrs_set || kpc_force_all_ctrs_set(1)) {
266 fprintf(stderr, "checkasm: kpc_force_all_ctrs_set failed\n");
267 return 1;
268 }
269 if (!kpc_set_counting || kpc_set_counting(KPC_MASK)) {
270 fprintf(stderr, "checkasm: kpc_set_counting failed\n");
271 return 1;
272 }
273 if (!kpc_set_counting || kpc_set_thread_counting(KPC_MASK)) {
274 fprintf(stderr, "checkasm: kpc_set_thread_counting failed\n");
275 return 1;
276 }
277 return 0;
278 }
279
checkasm_kperf_cycles(void)280 uint64_t checkasm_kperf_cycles(void) {
281 uint64_t counters[COUNTERS_COUNT];
282 if (kpc_get_thread_counters(0, COUNTERS_COUNT, counters))
283 return -1;
284
285 return counters[0];
286 }
287 #endif
288
is_negative(const intfloat u)289 static int is_negative(const intfloat u) {
290 return u.i >> 31;
291 }
292
float_near_ulp(const float a,const float b,const unsigned max_ulp)293 int float_near_ulp(const float a, const float b, const unsigned max_ulp) {
294 intfloat x, y;
295
296 x.f = a;
297 y.f = b;
298
299 if (is_negative(x) != is_negative(y)) {
300 // handle -0.0 == +0.0
301 return a == b;
302 }
303
304 if (llabs((int64_t)x.i - y.i) <= max_ulp)
305 return 1;
306
307 return 0;
308 }
309
/* Return 1 if float_near_ulp() holds for each of the first len element
 * pairs of a and b, otherwise 0 */
int float_near_ulp_array(const float *const a, const float *const b,
                         const unsigned max_ulp, const int len)
{
    int i = 0;

    /* Stop at the first pair that is too far apart */
    while (i < len && float_near_ulp(a[i], b[i], max_ulp))
        i++;

    return i >= len;
}
319
/* Return 1 if the absolute difference of a and b is strictly less than
 * eps, otherwise 0 */
int float_near_abs_eps(const float a, const float b, const float eps) {
    const float diff = a - b;
    return fabsf(diff) < eps;
}
323
/* Return 1 if float_near_abs_eps() holds for each of the first len
 * element pairs of a and b, otherwise 0 */
int float_near_abs_eps_array(const float *const a, const float *const b,
                             const float eps, const int len)
{
    int i = 0;

    /* Stop at the first pair that is too far apart */
    while (i < len && float_near_abs_eps(a[i], b[i], eps))
        i++;

    return i >= len;
}
333
/* Return 1 if a and b are close according to either float_near_ulp()
 * or float_near_abs_eps(), otherwise 0 */
int float_near_abs_eps_ulp(const float a, const float b, const float eps,
                           const unsigned max_ulp)
{
    if (float_near_ulp(a, b, max_ulp))
        return 1;

    return float_near_abs_eps(a, b, eps) != 0;
}
339
/* Return 1 if float_near_abs_eps_ulp() holds for each of the first len
 * element pairs of a and b, otherwise 0 */
int float_near_abs_eps_array_ulp(const float *const a, const float *const b,
                                 const float eps, const unsigned max_ulp,
                                 const int len)
{
    int i = 0;

    /* Stop at the first pair that is too far apart */
    while (i < len && float_near_abs_eps_ulp(a[i], b[i], eps, max_ulp))
        i++;

    return i >= len;
}
350
/* Non-zero if escape sequences may be written around colored text */
static int use_printf_color;

/* Print formatted text to f. If use_printf_color is set, the text is
 * preceded by the escape sequence selecting `color` and followed by the
 * one resetting the attributes. */
static void color_fprintf(FILE *const f, const int color, const char *const fmt, ...) {
    const int colored = use_printf_color;
    va_list args;

    if (colored)
        fprintf(f, "\x1b[0;%dm", color);

    va_start(args, fmt);
    vfprintf(f, fmt, args);
    va_end(args);

    if (colored)
        fprintf(f, "\x1b[0m");
}
366
367 /* Deallocate a tree */
destroy_func_tree(CheckasmFunc * const f)368 static void destroy_func_tree(CheckasmFunc *const f) {
369 if (f) {
370 CheckasmFuncVersion *v = f->versions.next;
371 while (v) {
372 CheckasmFuncVersion *next = v->next;
373 free(v);
374 v = next;
375 }
376
377 destroy_func_tree(f->child[0]);
378 destroy_func_tree(f->child[1]);
379 free(f);
380 }
381 }
382
383 /* Allocate a zero-initialized block, clean up and exit on failure */
checkasm_malloc(const size_t size)384 static void *checkasm_malloc(const size_t size) {
385 void *const ptr = calloc(1, size);
386 if (!ptr) {
387 fprintf(stderr, "checkasm: malloc failed\n");
388 destroy_func_tree(state.funcs);
389 exit(1);
390 }
391 return ptr;
392 }
393
394 /* Get the suffix of the specified cpu flag */
cpu_suffix(const unsigned cpu)395 static const char *cpu_suffix(const unsigned cpu) {
396 for (int i = (int)(sizeof(cpus) / sizeof(*cpus)) - 2; i >= 0; i--)
397 if (cpu & cpus[i].flag)
398 return cpus[i].suffix;
399
400 return "c";
401 }
402
403 #ifdef readtime
/* qsort() comparison function for uint16_t values in ascending order */
static int cmp_nop(const void *a, const void *b) {
    const uint16_t lhs = *(const uint16_t*)a;
    const uint16_t rhs = *(const uint16_t*)b;
    return lhs - rhs;
}
407
408 /* Measure the overhead of the timing code (in decicycles) */
measure_nop_time(void)409 static double measure_nop_time(void) {
410 uint16_t nops[10000];
411 int nop_sum = 0;
412
413 for (int i = 0; i < 10000; i++) {
414 uint64_t t = readtime();
415 nops[i] = (uint16_t) (readtime() - t);
416 }
417
418 qsort(nops, 10000, sizeof(uint16_t), cmp_nop);
419 for (int i = 2500; i < 7500; i++)
420 nop_sum += nops[i];
421
422 return nop_sum / 5000.0;
423 }
424
avg_cycles_per_call(const CheckasmFuncVersion * const v)425 static double avg_cycles_per_call(const CheckasmFuncVersion *const v) {
426 if (v->iterations) {
427 const double cycles = (double)v->cycles / v->iterations - state.nop_time;
428 if (cycles > 0.0)
429 return cycles / 4.0; /* 4 calls per iteration */
430 }
431 return 0.0;
432 }
433
434 /* Print benchmark results */
print_benchs(const CheckasmFunc * const f)435 static void print_benchs(const CheckasmFunc *const f) {
436 if (f) {
437 print_benchs(f->child[0]);
438
439 /* Only print functions with at least one assembly version */
440 const CheckasmFuncVersion *v = &f->versions;
441 if (v->iterations) {
442 const double baseline = avg_cycles_per_call(v);
443 do {
444 const int pad_length = 10 + state.max_function_name_length -
445 printf("%s_%s:", f->name, cpu_suffix(v->cpu));
446 const double cycles = avg_cycles_per_call(v);
447 const double ratio = cycles ? baseline / cycles : 0.0;
448 printf("%*.1f (%5.2fx)\n", imax(pad_length, 0), cycles, ratio);
449 } while ((v = v->next));
450 }
451
452 print_benchs(f->child[1]);
453 }
454 }
455 #endif
456
print_functions(const CheckasmFunc * const f)457 static void print_functions(const CheckasmFunc *const f) {
458 if (f) {
459 print_functions(f->child[0]);
460 const CheckasmFuncVersion *v = &f->versions;
461 printf("%s (%s", f->name, cpu_suffix(v->cpu));
462 while ((v = v->next))
463 printf(", %s", cpu_suffix(v->cpu));
464 printf(")\n");
465 print_functions(f->child[1]);
466 }
467 }
468
#define is_digit(x) ((x) >= '0' && (x) <= '9')

/* ASCIIbetical sort except preserving natural order for numbers.
 * Returns zero for equal names, otherwise a negative or positive value.
 * If the names first differ inside or right after a run of digits and
 * one run is longer than the other, the name with the longer run sorts
 * last. */
static int cmp_func_names(const char *a, const char *b) {
    const char *const start = a;
    int ascii_diff;

    /* Advance to the first differing character or the end of the names */
    for (;;) {
        ascii_diff = *(const unsigned char*)a - *(const unsigned char*)b;
        if (ascii_diff || !*a)
            break;
        a++;
        b++;
    }

    /* Skip the digits that follow in both names */
    while (is_digit(*a) && is_digit(*b)) {
        a++;
        b++;
    }

    /* If a digit was just passed, whichever name still has digits left
     * holds the longer number */
    if (a > start && is_digit(a[-1])) {
        const int digit_diff = is_digit(*a) - is_digit(*b);
        if (digit_diff)
            return digit_diff;
    }

    return ascii_diff;
}
488
489 /* Perform a tree rotation in the specified direction and return the new root */
rotate_tree(CheckasmFunc * const f,const int dir)490 static CheckasmFunc *rotate_tree(CheckasmFunc *const f, const int dir) {
491 CheckasmFunc *const r = f->child[dir^1];
492 f->child[dir^1] = r->child[dir];
493 r->child[dir] = f;
494 r->color = f->color;
495 f->color = 0;
496 return r;
497 }
498
499 #define is_red(f) ((f) && !(f)->color)
500
501 /* Balance a left-leaning red-black tree at the specified node */
balance_tree(CheckasmFunc ** const root)502 static void balance_tree(CheckasmFunc **const root) {
503 CheckasmFunc *const f = *root;
504
505 if (is_red(f->child[0]) && is_red(f->child[1])) {
506 f->color ^= 1;
507 f->child[0]->color = f->child[1]->color = 1;
508 }
509 else if (!is_red(f->child[0]) && is_red(f->child[1]))
510 *root = rotate_tree(f, 0); /* Rotate left */
511 else if (is_red(f->child[0]) && is_red(f->child[0]->child[0]))
512 *root = rotate_tree(f, 1); /* Rotate right */
513 }
514
515 /* Get a node with the specified name, creating it if it doesn't exist */
get_func(CheckasmFunc ** const root,const char * const name)516 static CheckasmFunc *get_func(CheckasmFunc **const root, const char *const name) {
517 CheckasmFunc *f = *root;
518
519 if (f) {
520 /* Search the tree for a matching node */
521 const int cmp = cmp_func_names(name, f->name);
522 if (cmp) {
523 f = get_func(&f->child[cmp > 0], name);
524
525 /* Rebalance the tree on the way up if a new node was inserted */
526 if (!f->versions.func)
527 balance_tree(root);
528 }
529 } else {
530 /* Allocate and insert a new node into the tree */
531 const size_t name_length = strlen(name) + 1;
532 f = *root = checkasm_malloc(offsetof(CheckasmFunc, name) + name_length);
533 memcpy(f->name, name, name_length);
534 }
535
536 return f;
537 }
538
/* NOTE(review): presumably the storage behind checkasm_save_context() and
 * checkasm_load_context(); neither is defined in this file - confirm
 * against checkasm.h */
checkasm_context checkasm_context_buf;
540
541 /* Crash handling: attempt to catch crashes and handle them
542 * gracefully instead of just aborting abruptly. */
543 #ifdef _WIN32
544 #if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
/* Vectored exception handler, installed by main().
 * While signal handling is enabled (state.sig == SIG_ATOMIC_MAX), the
 * exception code is translated into the equivalent signal number, stored
 * in state.sig, and checkasm_load_context() is called. Exceptions that are
 * not recognized, or that occur while handling is disabled, are passed on
 * to the next handler. */
static LONG NTAPI signal_handler(EXCEPTION_POINTERS *const e) {
    if (state.sig == SIG_ATOMIC_MAX) {
        int s;
        switch (e->ExceptionRecord->ExceptionCode) {
        case EXCEPTION_FLT_DIVIDE_BY_ZERO:
        case EXCEPTION_INT_DIVIDE_BY_ZERO:
            s = SIGFPE;
            break;
        case EXCEPTION_ILLEGAL_INSTRUCTION:
        case EXCEPTION_PRIV_INSTRUCTION:
            s = SIGILL;
            break;
        case EXCEPTION_ACCESS_VIOLATION:
        case EXCEPTION_ARRAY_BOUNDS_EXCEEDED:
        case EXCEPTION_DATATYPE_MISALIGNMENT:
        case EXCEPTION_STACK_OVERFLOW:
            s = SIGSEGV;
            break;
        case EXCEPTION_IN_PAGE_ERROR:
            s = SIGBUS;
            break;
        default:
            return EXCEPTION_CONTINUE_SEARCH;
        }
        /* Storing the signal number also disables further handling */
        state.sig = s;
        /* NOTE(review): presumably does not return - confirm */
        checkasm_load_context();
    }
    return EXCEPTION_CONTINUE_SEARCH;
}
574 #endif
575 #else
static void signal_handler(int s);

/* Action installed by main() for SIGBUS, SIGFPE, SIGILL and SIGSEGV.
 * SA_RESETHAND resets the disposition to the default when the handler is
 * entered, so the handler re-installs the action itself. */
static const struct sigaction signal_handler_act = {
    .sa_handler = signal_handler,
    .sa_flags = SA_RESETHAND,
};

/* While signal handling is enabled (state.sig == SIG_ATOMIC_MAX), record
 * the signal number in state.sig, re-arm the handler and call
 * checkasm_load_context(). If handling is disabled nothing is done, which
 * leaves the disposition reset by SA_RESETHAND in place. */
static void signal_handler(const int s) {
    if (state.sig == SIG_ATOMIC_MAX) {
        /* Storing the signal number also disables further handling */
        state.sig = s;
        sigaction(s, &signal_handler_act, NULL);
        /* NOTE(review): presumably does not return - confirm */
        checkasm_load_context();
    }
}
590 #endif
591
/* Compares a string with a wildcard pattern.
 * '*' matches any number of characters, including none. Returns zero if
 * the string matches and non-zero otherwise. */
static int wildstrcmp(const char *str, const char *pattern) {
    const char *const star = strchr(pattern, '*');

    /* Without a wildcard this is a plain string comparison */
    if (!star)
        return strcmp(str, pattern);

    /* Everything in front of the wildcard has to match literally */
    const size_t prefix_length = star - pattern;
    if (strncmp(str, pattern, prefix_length))
        return 1;

    /* Consecutive wildcards are equivalent to a single one */
    const char *rest = star + 1;
    while (*rest == '*')
        rest++;

    /* A trailing wildcard matches whatever is left */
    if (!*rest)
        return 0;

    /* Try to match the rest of the pattern at every remaining position */
    for (const char *s = str + prefix_length; *s; s++) {
        if (!wildstrcmp(s, rest))
            return 0;
    }

    return 1;
}
606
607 /* Perform tests and benchmarks for the specified
608 * cpu flag if supported by the host */
check_cpu_flag(const char * const name,unsigned flag)609 static void check_cpu_flag(const char *const name, unsigned flag) {
610 const unsigned old_cpu_flag = state.cpu_flag;
611
612 flag |= old_cpu_flag;
613 dav1d_set_cpu_flags_mask(flag);
614 state.cpu_flag = dav1d_get_cpu_flags();
615
616 if (!flag || state.cpu_flag != old_cpu_flag) {
617 state.cpu_flag_name = name;
618 state.suffix_length = (int)strlen(cpu_suffix(flag)) + 1;
619 for (int i = 0; tests[i].func; i++) {
620 if (state.test_pattern && wildstrcmp(tests[i].name, state.test_pattern))
621 continue;
622 xor128_srand(state.seed);
623 state.current_test_name = tests[i].name;
624 tests[i].func();
625 }
626 }
627 }
628
629 /* Print the name of the current CPU flag, but only do it once */
print_cpu_name(void)630 static void print_cpu_name(void) {
631 if (state.cpu_flag_name) {
632 color_fprintf(stderr, COLOR_YELLOW, "%s:\n", state.cpu_flag_name);
633 state.cpu_flag_name = NULL;
634 }
635 }
636
get_seed(void)637 static unsigned get_seed(void) {
638 #ifdef _WIN32
639 LARGE_INTEGER i;
640 QueryPerformanceCounter(&i);
641 return i.LowPart;
642 #elif defined(__APPLE__)
643 return (unsigned) mach_absolute_time();
644 #else
645 struct timespec ts;
646 clock_gettime(CLOCK_MONOTONIC, &ts);
647 return (unsigned) (1000000000ULL * ts.tv_sec + ts.tv_nsec);
648 #endif
649 }
650
/* Parse str as an unsigned long in the given base and store it in *dst.
 * Returns 0 if the entire string was a valid number. Returns 1 if
 * strtoul() set errno, no digits were found, or characters were left
 * over. *dst is written in either case. */
static int checkasm_strtoul(unsigned long *const dst, const char *const str, const int base) {
    char *tail;

    errno = 0;
    const unsigned long value = strtoul(str, &tail, base);
    *dst = value;

    if (errno)
        return 1;
    if (tail == str)
        return 1;
    return *tail != '\0';
}
657
main(int argc,char * argv[])658 int main(int argc, char *argv[]) {
659 state.seed = get_seed();
660
661 while (argc > 1) {
662 if (!strncmp(argv[1], "--help", 6) || !strcmp(argv[1], "-h")) {
663 fprintf(stderr,
664 "checkasm [options] <random seed>\n"
665 " <random seed> Numeric value to seed the rng\n"
666 "Options:\n"
667 " --affinity=<cpu> Run the process on CPU <cpu>\n"
668 " --test=<pattern> -t Test only <pattern>\n"
669 " --function=<pattern> -f Test only the functions matching <pattern>\n"
670 " --bench -b Benchmark the tested functions\n"
671 " --list-cpuflags List available cpu flags\n"
672 " --list-functions List available functions\n"
673 " --list-tests List available tests\n"
674 " --verbose -v Print verbose output\n");
675 return 0;
676 } else if (!strcmp(argv[1], "--bench") || !strcmp(argv[1], "-b")) {
677 #ifndef readtime
678 fprintf(stderr,
679 "checkasm: --bench is not supported on your system\n");
680 return 1;
681 #endif
682 state.run_mode = RUN_BENCHMARK;
683 } else if (!strncmp(argv[1], "--test=", 7)) {
684 state.test_pattern = argv[1] + 7;
685 } else if (!strcmp(argv[1], "-t")) {
686 state.test_pattern = argc > 1 ? argv[2] : "";
687 argc--;
688 argv++;
689 } else if (!strncmp(argv[1], "--function=", 11)) {
690 state.function_pattern = argv[1] + 11;
691 } else if (!strcmp(argv[1], "-f")) {
692 state.function_pattern = argc > 1 ? argv[2] : "";
693 argc--;
694 argv++;
695 } else if (!strcmp(argv[1], "--list-cpuflags")) {
696 state.run_mode = RUN_CPUFLAG_LISTING;
697 break;
698 } else if (!strcmp(argv[1], "--list-functions")) {
699 state.run_mode = RUN_FUNCTION_LISTING;
700 } else if (!strcmp(argv[1], "--list-tests")) {
701 for (int i = 0; tests[i].name; i++)
702 printf("%s\n", tests[i].name);
703 return 0;
704 } else if (!strcmp(argv[1], "--verbose") || !strcmp(argv[1], "-v")) {
705 state.verbose = 1;
706 } else if (!strncmp(argv[1], "--affinity=", 11)) {
707 const char *const s = argv[1] + 11;
708 unsigned long affinity;
709 if (checkasm_strtoul(&affinity, s, 16)) {
710 fprintf(stderr, "checkasm: invalid cpu affinity (%s)\n", s);
711 return 1;
712 }
713 #ifdef _WIN32
714 int affinity_err;
715 HANDLE process = GetCurrentProcess();
716 #if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
717 BOOL (WINAPI *spdcs)(HANDLE, const ULONG*, ULONG) =
718 (void*)GetProcAddress(GetModuleHandleW(L"kernel32.dll"), "SetProcessDefaultCpuSets");
719 if (spdcs)
720 affinity_err = !spdcs(process, (ULONG[]){ affinity + 256 }, 1);
721 else
722 #endif
723 {
724 if (affinity < sizeof(DWORD_PTR) * 8)
725 affinity_err = !SetProcessAffinityMask(process, (DWORD_PTR)1 << affinity);
726 else
727 affinity_err = 1;
728 }
729 if (affinity_err) {
730 fprintf(stderr, "checkasm: invalid cpu affinity (%lu)\n", affinity);
731 return 1;
732 } else {
733 fprintf(stderr, "checkasm: running on cpu %lu\n", affinity);
734 }
735 #elif defined(HAVE_PTHREAD_SETAFFINITY_NP) && defined(CPU_SET)
736 cpu_set_t set;
737 CPU_ZERO(&set);
738 CPU_SET(affinity, &set);
739 if (pthread_setaffinity_np(pthread_self(), sizeof(set), &set)) {
740 fprintf(stderr, "checkasm: invalid cpu affinity (%lu)\n", affinity);
741 return 1;
742 } else {
743 fprintf(stderr, "checkasm: running on cpu %lu\n", affinity);
744 }
745 #else
746 (void)affinity;
747 fprintf(stderr,
748 "checkasm: --affinity is not supported on your system\n");
749 return 1;
750 #endif
751 } else {
752 unsigned long seed;
753 if (checkasm_strtoul(&seed, argv[1], 10)) {
754 fprintf(stderr, "checkasm: unknown option (%s)\n", argv[1]);
755 return 1;
756 }
757 state.seed = (unsigned)seed;
758 }
759
760 argc--;
761 argv++;
762 }
763
764 #if TRIM_DSP_FUNCTIONS
765 fprintf(stderr, "checkasm: reference functions unavailable, reconfigure using '-Dtrim_dsp=false'\n");
766 return 0;
767 #endif
768
769 dav1d_init_cpu();
770
771 #ifdef _WIN32
772 #if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
773 AddVectoredExceptionHandler(0, signal_handler);
774
775 HANDLE con = GetStdHandle(state.run_mode >= RUN_CPUFLAG_LISTING ?
776 STD_OUTPUT_HANDLE : STD_ERROR_HANDLE);
777 DWORD con_mode = 0;
778 use_printf_color = con && con != INVALID_HANDLE_VALUE &&
779 GetConsoleMode(con, &con_mode) &&
780 SetConsoleMode(con, con_mode | ENABLE_VIRTUAL_TERMINAL_PROCESSING);
781 #endif
782 #else
783 sigaction(SIGBUS, &signal_handler_act, NULL);
784 sigaction(SIGFPE, &signal_handler_act, NULL);
785 sigaction(SIGILL, &signal_handler_act, NULL);
786 sigaction(SIGSEGV, &signal_handler_act, NULL);
787
788 if (isatty(state.run_mode >= RUN_CPUFLAG_LISTING ? 1 : 2)) {
789 const char *const term = getenv("TERM");
790 use_printf_color = term && strcmp(term, "dumb");
791 }
792 #endif
793
794 #ifdef readtime
795 if (state.run_mode == RUN_BENCHMARK) {
796 #if CONFIG_MACOS_KPERF
797 if (kperf_init())
798 return 1;
799 #endif
800 if (!checkasm_save_context()) {
801 checkasm_set_signal_handler_state(1);
802 readtime();
803 checkasm_set_signal_handler_state(0);
804 } else {
805 fprintf(stderr, "checkasm: unable to access cycle counter\n");
806 return 1;
807 }
808 }
809 #endif
810
811 int ret = 0;
812
813 if (state.run_mode != RUN_FUNCTION_LISTING) {
814 const unsigned cpu_flags = dav1d_get_cpu_flags();
815 if (state.run_mode == RUN_CPUFLAG_LISTING) {
816 const int last_i = (int)(sizeof(cpus) / sizeof(*cpus)) - 2;
817 for (int i = 0; i <= last_i ; i++) {
818 if (cpus[i].flag & cpu_flags)
819 color_fprintf(stdout, COLOR_GREEN, "%s", cpus[i].suffix);
820 else
821 color_fprintf(stdout, COLOR_RED, "~%s", cpus[i].suffix);
822 printf(i == last_i ? "\n" : ", ");
823 }
824 return 0;
825 }
826 #if ARCH_X86_64
827 void checkasm_warmup_avx2(void);
828 void checkasm_warmup_avx512(void);
829 if (cpu_flags & DAV1D_X86_CPU_FLAG_AVX512ICL)
830 state.simd_warmup = checkasm_warmup_avx512;
831 else if (cpu_flags & DAV1D_X86_CPU_FLAG_AVX2)
832 state.simd_warmup = checkasm_warmup_avx2;
833 checkasm_simd_warmup();
834 #endif
835 #if ARCH_X86
836 unsigned checkasm_init_x86(char *name);
837 char name[48];
838 const unsigned cpuid = checkasm_init_x86(name);
839 for (size_t len = strlen(name); len && name[len-1] == ' '; len--)
840 name[len-1] = '\0'; /* trim trailing whitespace */
841 fprintf(stderr, "checkasm: %s (%08X) using random seed %u\n", name, cpuid, state.seed);
842 #elif ARCH_RISCV
843 char buf[32] = "";
844 if (cpu_flags & DAV1D_RISCV_CPU_FLAG_V) {
845 const int vlen = 8*checkasm_get_vlenb();
846 snprintf(buf, sizeof(buf), "VLEN=%i bits, ", vlen);
847 }
848 fprintf(stderr, "checkasm: %susing random seed %u\n", buf, state.seed);
849 #elif ARCH_AARCH64 && HAVE_SVE
850 char buf[48] = "";
851 if (cpu_flags & DAV1D_ARM_CPU_FLAG_SVE)
852 snprintf(buf, sizeof(buf), "SVE %d bits, ", checkasm_sve_length());
853 fprintf(stderr, "checkasm: %susing random seed %u\n", buf, state.seed);
854 #else
855 fprintf(stderr, "checkasm: using random seed %u\n", state.seed);
856 #endif
857 }
858
859 check_cpu_flag(NULL, 0);
860 for (int i = 0; cpus[i].flag; i++)
861 check_cpu_flag(cpus[i].name, cpus[i].flag);
862
863 if (state.run_mode == RUN_FUNCTION_LISTING) {
864 print_functions(state.funcs);
865 } else if (state.num_failed) {
866 fprintf(stderr, "checkasm: %d of %d tests failed\n",
867 state.num_failed, state.num_checked);
868 ret = 1;
869 } else {
870 if (state.num_checked)
871 fprintf(stderr, "checkasm: all %d tests passed\n", state.num_checked);
872 else
873 fprintf(stderr, "checkasm: no tests to perform\n");
874 #ifdef readtime
875 if (state.run_mode == RUN_BENCHMARK && state.max_function_name_length) {
876 state.nop_time = measure_nop_time();
877 if (state.verbose)
878 printf("nop:%*.1f\n", state.max_function_name_length + 6, state.nop_time);
879 print_benchs(state.funcs);
880 }
881 #endif
882 }
883
884 destroy_func_tree(state.funcs);
885 return ret;
886 }
887
/* Decide whether or not the specified function needs to be tested and
 * allocate/initialize data structures if needed. Returns a pointer to a
 * reference function if the function should be tested, otherwise NULL.
 *
 * `name` is a printf-style format for the function name. The reference
 * is the most recently registered version that has not failed, or `func`
 * itself if it is the first version registered under that name. */
void *checkasm_check_func(void *const func, const char *const name, ...) {
    char name_buf[256];
    va_list arg;

    va_start(arg, name);
    int name_length = vsnprintf(name_buf, sizeof(name_buf), name, arg);
    va_end(arg);

    /* Skip missing functions, names that are empty or did not fit into
     * the buffer, and names rejected by the function pattern */
    if (!func || name_length <= 0 || (size_t)name_length >= sizeof(name_buf) ||
        (state.function_pattern && wildstrcmp(name_buf, state.function_pattern)))
    {
        return NULL;
    }

    state.current_func = get_func(&state.funcs, name_buf);

    /* Rebalancing may have changed the root; it is always colored black */
    state.funcs->color = 1;
    CheckasmFuncVersion *v = &state.current_func->versions;
    void *ref = func;

    /* A node whose embedded version is still unused has just been created */
    if (v->func) {
        CheckasmFuncVersion *prev;
        do {
            /* Only test functions that haven't already been tested */
            if (v->func == func)
                return NULL;

            if (v->ok)
                ref = v->func;

            prev = v;
        } while ((v = v->next));

        /* Append a new version to the end of the list */
        v = prev->next = checkasm_malloc(sizeof(CheckasmFuncVersion));
    }

    /* Track the longest name including the cpu suffix */
    name_length += state.suffix_length;
    if (name_length > state.max_function_name_length)
        state.max_function_name_length = name_length;

    v->func = func;
    v->ok = 1;
    v->cpu = state.cpu_flag;
    state.current_func_ver = v;
    if (state.run_mode == RUN_FUNCTION_LISTING) /* Save function names without running tests */
        return NULL;

    /* Every function is tested starting from the same rng state */
    xor128_srand(state.seed);

    /* Versions registered without any cpu flag are not counted */
    if (state.cpu_flag)
        state.num_checked++;

    return ref;
}
945
946 /* Decide whether or not the current function needs to be benchmarked */
checkasm_bench_func(void)947 int checkasm_bench_func(void) {
948 return !state.num_failed && state.run_mode == RUN_BENCHMARK;
949 }
950
951 /* Indicate that the current test has failed, return whether verbose printing
952 * is requested. */
checkasm_fail_func(const char * const msg,...)953 int checkasm_fail_func(const char *const msg, ...) {
954 if (state.current_func_ver && state.current_func_ver->cpu &&
955 state.current_func_ver->ok)
956 {
957 va_list arg;
958
959 print_cpu_name();
960 fprintf(stderr, " %s_%s (", state.current_func->name,
961 cpu_suffix(state.current_func_ver->cpu));
962 va_start(arg, msg);
963 vfprintf(stderr, msg, arg);
964 va_end(arg);
965 fprintf(stderr, ")\n");
966
967 state.current_func_ver->ok = 0;
968 state.num_failed++;
969 }
970 return state.verbose;
971 }
972
973 /* Update benchmark results of the current function */
checkasm_update_bench(const int iterations,const uint64_t cycles)974 void checkasm_update_bench(const int iterations, const uint64_t cycles) {
975 state.current_func_ver->iterations += iterations;
976 state.current_func_ver->cycles += cycles;
977 }
978
979 /* Print the outcome of all tests performed since
980 * the last time this function was called */
checkasm_report(const char * const name,...)981 void checkasm_report(const char *const name, ...) {
982 static int prev_checked, prev_failed;
983 static size_t max_length;
984
985 if (state.num_checked > prev_checked) {
986 int pad_length = (int) max_length + 4;
987 va_list arg;
988
989 print_cpu_name();
990 pad_length -= fprintf(stderr, " - %s.", state.current_test_name);
991 va_start(arg, name);
992 pad_length -= vfprintf(stderr, name, arg);
993 va_end(arg);
994 fprintf(stderr, "%*c", imax(pad_length, 0) + 2, '[');
995
996 if (state.num_failed == prev_failed)
997 color_fprintf(stderr, COLOR_GREEN, "OK");
998 else
999 color_fprintf(stderr, COLOR_RED, "FAILED");
1000 fprintf(stderr, "]\n");
1001
1002 prev_checked = state.num_checked;
1003 prev_failed = state.num_failed;
1004 } else if (!state.cpu_flag) {
1005 /* Calculate the amount of padding required
1006 * to make the output vertically aligned */
1007 size_t length = strlen(state.current_test_name);
1008 va_list arg;
1009
1010 va_start(arg, name);
1011 length += vsnprintf(NULL, 0, name, arg);
1012 va_end(arg);
1013
1014 if (length > max_length)
1015 max_length = length;
1016 }
1017 }
1018
checkasm_set_signal_handler_state(const int enabled)1019 void checkasm_set_signal_handler_state(const int enabled) {
1020 state.sig = enabled ? SIG_ATOMIC_MAX : 0;
1021 }
1022
checkasm_handle_signal(void)1023 void checkasm_handle_signal(void) {
1024 const int s = state.sig;
1025 checkasm_fail_func(s == SIGFPE ? "fatal arithmetic error" :
1026 s == SIGILL ? "illegal instruction" :
1027 s == SIGBUS ? "bus error" :
1028 "segmentation fault");
1029 }
1030
/* Helper for the buffer comparison functions: registers a mismatch once per
 * comparison and tells the caller whether to stop.
 *
 * If *err is already set, nothing happens and 0 is returned. Otherwise the
 * failure is reported through checkasm_fail_func() with "file:line" as the
 * message. When that returns zero (no verbose output requested) this
 * returns 1, telling the caller to bail out immediately. When it returns
 * non-zero, *err is set, a "<name> (<w>x<h>):" header is written to stderr
 * and 0 is returned so the caller can go on printing details. */
static int check_err(const char *const file, const int line,
                     const char *const name, const int w, const int h,
                     int *const err)
{
    if (!*err) {
        const int verbose = checkasm_fail_func("%s:%d", file, line);
        if (!verbose)
            return 1;

        *err = 1;
        fprintf(stderr, "%s (%dx%d):\n", name, w, h);
    }
    return 0;
}
1043
/* Defines checkasm_check_<type>(), which compares two 2D buffers of `type`
 * and reports differences through check_err().
 *
 *   file, line        location passed on to the failure message
 *   buf1, buf2        pointers to the top-left element of each w x h area
 *   stride1, stride2  row strides in bytes (converted to elements below)
 *   w, h              size of the compared area in elements / rows
 *   name              label printed in the mismatch header
 *   align_w, align_h  w and h are rounded up to these with a bit mask, so
 *                     they are presumably expected to be powers of two
 *   padding           number of guard elements/rows around the area that
 *                     must also be identical in both buffers
 *
 * Checks, in order: the w x h payload (dumping both buffers and a
 * difference map on mismatch), the `padding` rows above, the `padding` rows
 * below the aligned height, and the `padding` columns left of the area and
 * right of the aligned width.
 *
 * Returns 0 when everything matches and 1 otherwise. When check_err()
 * returns non-zero, the function returns 1 at the first mismatch without
 * printing any details.
 *
 * `fmt` is the printf conversion used to dump a single element. */
#define DEF_CHECKASM_CHECK_FUNC(type, fmt) \
int checkasm_check_##type(const char *const file, const int line, \
                          const type *buf1, ptrdiff_t stride1, \
                          const type *buf2, ptrdiff_t stride2, \
                          const int w, int h, const char *const name, \
                          const int align_w, const int align_h, \
                          const int padding) \
{ \
    int aligned_w = (w + align_w - 1) & ~(align_w - 1); \
    int aligned_h = (h + align_h - 1) & ~(align_h - 1); \
    int err = 0; \
    /* Convert byte strides to element strides. The divisor is cast to a */ \
    /* signed type: dividing a negative ptrdiff_t by an unsigned size_t  */ \
    /* would convert the stride to unsigned first and yield garbage.     */ \
    stride1 /= (ptrdiff_t) sizeof(*buf1); \
    stride2 /= (ptrdiff_t) sizeof(*buf2); \
    int y = 0; \
    /* Find the first payload row that differs, if any. */ \
    for (y = 0; y < h; y++) \
        if (memcmp(&buf1[y*stride1], &buf2[y*stride2], w*sizeof(*buf1))) \
            break; \
    if (y != h) { \
        if (check_err(file, line, name, w, h, &err)) \
            return 1; \
        /* Dump each row: buf1 values, buf2 values, then a difference map. */ \
        for (y = 0; y < h; y++) { \
            for (int x = 0; x < w; x++) \
                fprintf(stderr, " " fmt, buf1[x]); \
            fprintf(stderr, " "); \
            for (int x = 0; x < w; x++) \
                fprintf(stderr, " " fmt, buf2[x]); \
            fprintf(stderr, " "); \
            for (int x = 0; x < w; x++) \
                fprintf(stderr, "%c", buf1[x] != buf2[x] ? 'x' : '.'); \
            buf1 += stride1; \
            buf2 += stride2; \
            fprintf(stderr, "\n"); \
        } \
        /* Rewind to the top-left element for the padding checks below. */ \
        buf1 -= h*stride1; \
        buf2 -= h*stride2; \
    } \
    /* Guard rows above the area. */ \
    for (y = -padding; y < 0; y++) \
        if (memcmp(&buf1[y*stride1 - padding], &buf2[y*stride2 - padding], \
                   (w + 2*padding)*sizeof(*buf1))) { \
            if (check_err(file, line, name, w, h, &err)) \
                return 1; \
            fprintf(stderr, " overwrite above\n"); \
            break; \
        } \
    /* Guard rows below the aligned height. */ \
    for (y = aligned_h; y < aligned_h + padding; y++) \
        if (memcmp(&buf1[y*stride1 - padding], &buf2[y*stride2 - padding], \
                   (w + 2*padding)*sizeof(*buf1))) { \
            if (check_err(file, line, name, w, h, &err)) \
                return 1; \
            fprintf(stderr, " overwrite below\n"); \
            break; \
        } \
    /* Guard columns to the left of every payload row. */ \
    for (y = 0; y < h; y++) \
        if (memcmp(&buf1[y*stride1 - padding], &buf2[y*stride2 - padding], \
                   padding*sizeof(*buf1))) { \
            if (check_err(file, line, name, w, h, &err)) \
                return 1; \
            fprintf(stderr, " overwrite left\n"); \
            break; \
        } \
    /* Guard columns to the right of the aligned width. */ \
    for (y = 0; y < h; y++) \
        if (memcmp(&buf1[y*stride1 + aligned_w], &buf2[y*stride2 + aligned_w], \
                   padding*sizeof(*buf1))) { \
            if (check_err(file, line, name, w, h, &err)) \
                return 1; \
            fprintf(stderr, " overwrite right\n"); \
            break; \
        } \
    return err; \
}

DEF_CHECKASM_CHECK_FUNC(int8_t,   "%4d")
DEF_CHECKASM_CHECK_FUNC(int16_t,  "%6d")
DEF_CHECKASM_CHECK_FUNC(int32_t,  "%9d")
DEF_CHECKASM_CHECK_FUNC(uint8_t,  "%02x")
DEF_CHECKASM_CHECK_FUNC(uint16_t, "%04x")
DEF_CHECKASM_CHECK_FUNC(uint32_t, "%08x")
1121
#if ARCH_X86_64
/* Invoke the warmup callback stored in state.simd_warmup, if one is set.
 * Only compiled when ARCH_X86_64 is non-zero. */
void checkasm_simd_warmup(void)
{
    if (!state.simd_warmup)
        return;
    state.simd_warmup();
}
#endif
1129