• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright © 2018, VideoLAN and dav1d authors
3  * Copyright © 2018, Two Orioles, LLC
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  *
9  * 1. Redistributions of source code must retain the above copyright notice, this
10  *    list of conditions and the following disclaimer.
11  *
12  * 2. Redistributions in binary form must reproduce the above copyright notice,
13  *    this list of conditions and the following disclaimer in the documentation
14  *    and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 #include "tests/checkasm/checkasm.h"
28 
29 #include <errno.h>
30 #include <math.h>
31 #include <signal.h>
32 #include <stdarg.h>
33 #include <stdio.h>
34 #include <string.h>
35 
36 #include "src/cpu.h"
37 
38 #ifdef _WIN32
39 #ifndef SIGBUS
40 /* non-standard, use the same value as mingw-w64 */
41 #define SIGBUS 10
42 #endif
43 #ifndef ENABLE_VIRTUAL_TERMINAL_PROCESSING
44 #define ENABLE_VIRTUAL_TERMINAL_PROCESSING 0x04
45 #endif
46 #else
47 #include <unistd.h>
48 #include <time.h>
49 #include <pthread.h>
50 #ifdef HAVE_PTHREAD_NP_H
51 #include <pthread_np.h>
52 #endif
53 #ifdef __APPLE__
54 #include <mach/mach_time.h>
55 #endif
56 #endif
57 #if CONFIG_MACOS_KPERF
58 #include <dlfcn.h>
59 #endif
60 
61 #define COLOR_RED    31
62 #define COLOR_GREEN  32
63 #define COLOR_YELLOW 33
64 
/* List of tests to invoke.
 * Each entry pairs the name that is matched against the --test pattern (and
 * printed by --list-tests) with the function that runs the test. The table
 * is terminated by an all-zero sentinel entry. */
static const struct {
    const char *name;   /* test name, e.g. "mc_8bpc" */
    void (*func)(void); /* entry point of the test */
} tests[] = {
    { "msac", checkasm_check_msac },
    { "pal", checkasm_check_pal },
    { "refmvs", checkasm_check_refmvs },
#if CONFIG_8BPC
    { "cdef_8bpc", checkasm_check_cdef_8bpc },
    { "filmgrain_8bpc", checkasm_check_filmgrain_8bpc },
    { "ipred_8bpc", checkasm_check_ipred_8bpc },
    { "itx_8bpc", checkasm_check_itx_8bpc },
    { "loopfilter_8bpc", checkasm_check_loopfilter_8bpc },
    { "looprestoration_8bpc", checkasm_check_looprestoration_8bpc },
    { "mc_8bpc", checkasm_check_mc_8bpc },
#endif
#if CONFIG_16BPC
    { "cdef_16bpc", checkasm_check_cdef_16bpc },
    { "filmgrain_16bpc", checkasm_check_filmgrain_16bpc },
    { "ipred_16bpc", checkasm_check_ipred_16bpc },
    { "itx_16bpc", checkasm_check_itx_16bpc },
    { "loopfilter_16bpc", checkasm_check_loopfilter_16bpc },
    { "looprestoration_16bpc", checkasm_check_looprestoration_16bpc },
    { "mc_16bpc", checkasm_check_mc_16bpc },
#endif
    { 0 } /* sentinel: iteration stops at the first NULL name/func */
};
93 
/* List of cpu flags to check.
 * Only the entries for the architecture being built are compiled in. The
 * table is terminated by an all-zero sentinel; cpu_suffix() and the cpu flag
 * listing in main() rely on it being the last element. cpu_suffix() scans
 * the table from the end, so when several flags are set the later entry
 * determines the suffix. */
static const struct {
    const char *name;   /* display name, printed as the per-flag header */
    const char *suffix; /* function name suffix, see cpu_suffix() */
    unsigned flag;      /* DAV1D_*_CPU_FLAG_* bit */
} cpus[] = {
#if ARCH_X86
    { "SSE2",               "sse2",      DAV1D_X86_CPU_FLAG_SSE2 },
    { "SSSE3",              "ssse3",     DAV1D_X86_CPU_FLAG_SSSE3 },
    { "SSE4.1",             "sse4",      DAV1D_X86_CPU_FLAG_SSE41 },
    { "AVX2",               "avx2",      DAV1D_X86_CPU_FLAG_AVX2 },
    { "AVX-512 (Ice Lake)", "avx512icl", DAV1D_X86_CPU_FLAG_AVX512ICL },
#elif ARCH_AARCH64 || ARCH_ARM
    { "NEON",               "neon",      DAV1D_ARM_CPU_FLAG_NEON },
    { "DOTPROD",            "dotprod",   DAV1D_ARM_CPU_FLAG_DOTPROD },
    { "I8MM",               "i8mm",      DAV1D_ARM_CPU_FLAG_I8MM },
#if ARCH_AARCH64
    { "SVE",                "sve",       DAV1D_ARM_CPU_FLAG_SVE },
    { "SVE2",               "sve2",      DAV1D_ARM_CPU_FLAG_SVE2 },
#endif /* ARCH_AARCH64 */
#elif ARCH_LOONGARCH
    { "LSX",                "lsx",       DAV1D_LOONGARCH_CPU_FLAG_LSX },
    { "LASX",               "lasx",      DAV1D_LOONGARCH_CPU_FLAG_LASX },
#elif ARCH_PPC64LE
    { "VSX",                "vsx",       DAV1D_PPC_CPU_FLAG_VSX },
    { "PWR9",               "pwr9",      DAV1D_PPC_CPU_FLAG_PWR9 },
#elif ARCH_RISCV
    { "RVV",                "rvv",       DAV1D_RISCV_CPU_FLAG_V },
#endif
    { 0 } /* sentinel: flag == 0 ends iteration */
};
125 
/* Vector length queries implemented outside this file (presumably in
 * assembly -- TODO confirm). main() prints checkasm_sve_length() as the SVE
 * width in bits and 8 * checkasm_get_vlenb() as the RISC-V VLEN in bits. */
#if ARCH_AARCH64 && HAVE_SVE
int checkasm_sve_length(void);
#elif ARCH_RISCV
int checkasm_get_vlenb(void);
#endif
131 
/* One implementation of a function, stored as a node in a singly linked
 * list. The first node is embedded in CheckasmFunc; the following ones are
 * heap-allocated (see destroy_func_tree()). */
typedef struct CheckasmFuncVersion {
    struct CheckasmFuncVersion *next; /* next version, NULL at the end */
    void *func;      /* function pointer; NULL while the node is still unused */
    int ok;          /* NOTE(review): presumably the pass/fail status -- set outside this view */
    unsigned cpu;    /* cpu flags of this version, mapped to a suffix by cpu_suffix() */
    int iterations;  /* number of benchmark iterations accumulated in cycles */
    uint64_t cycles; /* total timer ticks measured over all iterations */
} CheckasmFuncVersion;
140 
/* Binary search tree node.
 * The tree is keyed by function name (ordered by cmp_func_names()) and kept
 * balanced as a left-leaning red-black tree by balance_tree(). */
typedef struct CheckasmFunc {
    struct CheckasmFunc *child[2]; /* [0] = names sorting lower, [1] = higher */
    CheckasmFuncVersion versions;  /* head of the list of implementations */
    uint8_t color; /* 0 = red, 1 = black */
    char name[];   /* NUL-terminated function name, allocated with the node */
} CheckasmFunc;
148 
/* What the program has been asked to do.
 * The order matters: main() treats every mode >= RUN_CPUFLAG_LISTING as a
 * listing mode whose output goes to stdout rather than stderr. */
typedef enum {
    RUN_NORMAL = 0,       /* run the tests */
    RUN_BENCHMARK,        /* run the tests and print benchmark results (--bench) */
    RUN_CPUFLAG_LISTING,  /* --list-cpuflags */
    RUN_FUNCTION_LISTING, /* --list-functions */
} CheckasmRunMode;
155 
/* Internal state shared by the whole test harness. Zero-initialized as a
 * static object; main() fills in the command line settings. */
static struct {
    CheckasmFunc *funcs;                   /* root of the function tree */
    CheckasmFunc *current_func;            /* function being checked (set outside this view) */
    CheckasmFuncVersion *current_func_ver; /* version being checked (set outside this view) */
    const char *current_test_name;         /* name of the running tests[] entry */
    int num_checked;                       /* number of tests reported in the summary */
    int num_failed;                        /* number of failed tests; non-zero makes main() return 1 */
    double nop_time;                       /* timer overhead from measure_nop_time() */
    unsigned cpu_flag;                     /* flags returned by dav1d_get_cpu_flags() for the current pass */
    const char *cpu_flag_name;             /* header still to be printed by print_cpu_name(), or NULL */
    const char *test_pattern;              /* --test/-t wildcard pattern, NULL = all tests */
    const char *function_pattern;          /* --function/-f wildcard pattern */
    unsigned seed;                         /* seed passed to xor128_srand() before each test */
    CheckasmRunMode run_mode;
    int verbose;                           /* set by --verbose/-v */
    volatile sig_atomic_t sig; // SIG_ATOMIC_MAX = signal handling enabled
    int suffix_length;                     /* strlen of the current cpu suffix + 1 */
    int max_function_name_length;          /* used to align the benchmark output columns */
#if ARCH_X86_64
    void (*simd_warmup)(void);             /* AVX2/AVX-512 warmup routine selected in main(), or NULL */
#endif
} state;
179 
/* float compare support code */
/* Gives access to the bit pattern of a float; used by is_negative() and
 * float_near_ulp() to compare floats by their integer representation. */
typedef union {
    float f;    /* value as a float */
    uint32_t i; /* the same 32 bits viewed as an unsigned integer */
} intfloat;
185 
/* State words of the xor128 pseudo-random number generator */
static uint32_t xs_state[4];

/* Seed the xor128 generator.
 * The four state words are the seed, two mixes of the seed's halves with the
 * halves of its complement, and the complement itself. */
static void xor128_srand(unsigned seed) {
    const unsigned inverted = ~seed;
    const unsigned hi_mask = 0xffff0000;
    const unsigned lo_mask = 0x0000ffff;

    xs_state[0] = seed;
    xs_state[1] = (seed     & hi_mask) | (inverted & lo_mask);
    xs_state[2] = (inverted & hi_mask) | (seed     & lo_mask);
    xs_state[3] = inverted;
}
194 
195 // xor128 from Marsaglia, George (July 2003). "Xorshift RNGs".
196 //             Journal of Statistical Software. 8 (14).
197 //             doi:10.18637/jss.v008.i14.
xor128_rand(void)198 int xor128_rand(void) {
199     const uint32_t x = xs_state[0];
200     const uint32_t t = x ^ (x << 11);
201 
202     xs_state[0] = xs_state[1];
203     xs_state[1] = xs_state[2];
204     xs_state[2] = xs_state[3];
205     uint32_t w = xs_state[3];
206 
207     w = (w ^ (w >> 19)) ^ (t ^ (t >> 8));
208     xs_state[3] = w;
209 
210     return w >> 1;
211 }
212 
213 #if CONFIG_MACOS_KPERF
214 
/* Resolved from the kperf framework by kperf_init() and called by
 * checkasm_kperf_cycles() to read the counters of the current thread. */
static int (*kpc_get_thread_counters)(int, unsigned int, void *);

/* NOTE(review): the values below presumably mirror Apple's private kperf/kpc
 * definitions; they are not documented in this file -- confirm before
 * changing them. */
#define CFGWORD_EL0A64EN_MASK (0x20000)

#define CPMU_CORE_CYCLE 0x02

#define KPC_CLASS_FIXED_MASK        (1 << 0)
#define KPC_CLASS_CONFIGURABLE_MASK (1 << 1)

/* Counter and config register counts that kperf_init() requires
 * kpc_get_counter_count() and kpc_get_config_count() to report for KPC_MASK */
#define COUNTERS_COUNT 10
#define CONFIG_COUNT 8
#define KPC_MASK (KPC_CLASS_CONFIGURABLE_MASK | KPC_CLASS_FIXED_MASK)
227 
kperf_init(void)228 static int kperf_init(void) {
229     uint64_t config[COUNTERS_COUNT] = { 0 };
230 
231     void *kperf = dlopen("/System/Library/PrivateFrameworks/kperf.framework/kperf", RTLD_LAZY);
232     if (!kperf) {
233         fprintf(stderr, "checkasm: Unable to load kperf: %s\n", dlerror());
234         return 1;
235     }
236 
237     int (*kpc_force_all_ctrs_set)(int) = dlsym(kperf, "kpc_force_all_ctrs_set");
238     int (*kpc_set_counting)(uint32_t) = dlsym(kperf, "kpc_set_counting");
239     int (*kpc_set_thread_counting)(uint32_t) = dlsym(kperf, "kpc_set_thread_counting");
240     int (*kpc_set_config)(uint32_t, void *) = dlsym(kperf, "kpc_set_config");
241     uint32_t (*kpc_get_counter_count)(uint32_t) = dlsym(kperf, "kpc_get_counter_count");
242     uint32_t (*kpc_get_config_count)(uint32_t) = dlsym(kperf, "kpc_get_config_count");
243     kpc_get_thread_counters = dlsym(kperf, "kpc_get_thread_counters");
244 
245     if (!kpc_get_thread_counters) {
246         fprintf(stderr, "checkasm: Unable to load kpc_get_thread_counters\n");
247         return 1;
248     }
249 
250     if (!kpc_get_counter_count || kpc_get_counter_count(KPC_MASK) != COUNTERS_COUNT) {
251         fprintf(stderr, "checkasm: Unxpected kpc_get_counter_count\n");
252         return 1;
253     }
254     if (!kpc_get_config_count || kpc_get_config_count(KPC_MASK) != CONFIG_COUNT) {
255         fprintf(stderr, "checkasm: Unxpected kpc_get_config_count\n");
256         return 1;
257     }
258 
259     config[0] = CPMU_CORE_CYCLE | CFGWORD_EL0A64EN_MASK;
260 
261     if (!kpc_set_config || kpc_set_config(KPC_MASK, config)) {
262         fprintf(stderr, "checkasm: The kperf API needs to be run as root\n");
263         return 1;
264     }
265     if (!kpc_force_all_ctrs_set || kpc_force_all_ctrs_set(1)) {
266         fprintf(stderr, "checkasm: kpc_force_all_ctrs_set failed\n");
267         return 1;
268     }
269     if (!kpc_set_counting || kpc_set_counting(KPC_MASK)) {
270         fprintf(stderr, "checkasm: kpc_set_counting failed\n");
271         return 1;
272     }
273     if (!kpc_set_counting || kpc_set_thread_counting(KPC_MASK)) {
274         fprintf(stderr, "checkasm: kpc_set_thread_counting failed\n");
275         return 1;
276     }
277     return 0;
278 }
279 
checkasm_kperf_cycles(void)280 uint64_t checkasm_kperf_cycles(void) {
281     uint64_t counters[COUNTERS_COUNT];
282     if (kpc_get_thread_counters(0, COUNTERS_COUNT, counters))
283         return -1;
284 
285     return counters[0];
286 }
287 #endif
288 
is_negative(const intfloat u)289 static int is_negative(const intfloat u) {
290     return u.i >> 31;
291 }
292 
float_near_ulp(const float a,const float b,const unsigned max_ulp)293 int float_near_ulp(const float a, const float b, const unsigned max_ulp) {
294     intfloat x, y;
295 
296     x.f = a;
297     y.f = b;
298 
299     if (is_negative(x) != is_negative(y)) {
300         // handle -0.0 == +0.0
301         return a == b;
302     }
303 
304     if (llabs((int64_t)x.i - y.i) <= max_ulp)
305         return 1;
306 
307     return 0;
308 }
309 
/* Element-wise float_near_ulp() over two arrays of len floats.
 * Returns 1 if every pair matches (also when len <= 0), 0 at the first
 * mismatch. */
int float_near_ulp_array(const float *const a, const float *const b,
                         const unsigned max_ulp, const int len)
{
    int idx = 0;

    while (idx < len) {
        if (!float_near_ulp(a[idx], b[idx], max_ulp))
            return 0;
        idx++;
    }

    return 1;
}
319 
/* Returns 1 if the absolute difference of a and b is strictly below eps,
 * 0 otherwise */
int float_near_abs_eps(const float a, const float b, const float eps) {
    const float delta = a - b;
    return fabsf(delta) < eps;
}
323 
/* Element-wise float_near_abs_eps() over two arrays of len floats.
 * Returns 1 if every pair matches (also when len <= 0), 0 at the first
 * mismatch. */
int float_near_abs_eps_array(const float *const a, const float *const b,
                             const float eps, const int len)
{
    int idx = 0;

    while (idx < len) {
        if (!float_near_abs_eps(a[idx], b[idx], eps))
            return 0;
        idx++;
    }

    return 1;
}
333 
/* Returns 1 if a and b match under either criterion: within max_ulp units
 * in the last place, or closer than eps in absolute terms */
int float_near_abs_eps_ulp(const float a, const float b, const float eps,
                           const unsigned max_ulp)
{
    if (float_near_ulp(a, b, max_ulp))
        return 1;

    return float_near_abs_eps(a, b, eps);
}
339 
/* Element-wise float_near_abs_eps_ulp() over two arrays of len floats.
 * Returns 1 if every pair matches (also when len <= 0), 0 at the first
 * mismatch. */
int float_near_abs_eps_array_ulp(const float *const a, const float *const b,
                                 const float eps, const unsigned max_ulp,
                                 const int len)
{
    int idx = 0;

    while (idx < len) {
        if (!float_near_abs_eps_ulp(a[idx], b[idx], eps, max_ulp))
            return 0;
        idx++;
    }

    return 1;
}
350 
/* Non-zero if the output terminal supports ANSI color escape sequences */
static int use_printf_color;

/* printf-style output to f. When use_printf_color is set, the text is
 * wrapped in the escape sequence selecting the given color (one of the
 * COLOR_* codes) and a trailing attribute reset. */
static void color_fprintf(FILE *const f, const int color, const char *const fmt, ...) {
    const int colored = use_printf_color;
    va_list args;

    if (colored)
        fprintf(f, "\x1b[0;%dm", color);

    va_start(args, fmt);
    vfprintf(f, fmt, args);
    va_end(args);

    if (colored)
        fputs("\x1b[0m", f);
}
366 
367 /* Deallocate a tree */
destroy_func_tree(CheckasmFunc * const f)368 static void destroy_func_tree(CheckasmFunc *const f) {
369     if (f) {
370         CheckasmFuncVersion *v = f->versions.next;
371         while (v) {
372             CheckasmFuncVersion *next = v->next;
373             free(v);
374             v = next;
375         }
376 
377         destroy_func_tree(f->child[0]);
378         destroy_func_tree(f->child[1]);
379         free(f);
380     }
381 }
382 
383 /* Allocate a zero-initialized block, clean up and exit on failure */
checkasm_malloc(const size_t size)384 static void *checkasm_malloc(const size_t size) {
385     void *const ptr = calloc(1, size);
386     if (!ptr) {
387         fprintf(stderr, "checkasm: malloc failed\n");
388         destroy_func_tree(state.funcs);
389         exit(1);
390     }
391     return ptr;
392 }
393 
394 /* Get the suffix of the specified cpu flag */
cpu_suffix(const unsigned cpu)395 static const char *cpu_suffix(const unsigned cpu) {
396     for (int i = (int)(sizeof(cpus) / sizeof(*cpus)) - 2; i >= 0; i--)
397         if (cpu & cpus[i].flag)
398             return cpus[i].suffix;
399 
400     return "c";
401 }
402 
403 #ifdef readtime
/* qsort() comparator ordering uint16_t values ascending */
static int cmp_nop(const void *a, const void *b) {
    const uint16_t lhs = *(const uint16_t *)a;
    const uint16_t rhs = *(const uint16_t *)b;

    return (int)lhs - (int)rhs;
}
407 
408 /* Measure the overhead of the timing code (in decicycles) */
measure_nop_time(void)409 static double measure_nop_time(void) {
410     uint16_t nops[10000];
411     int nop_sum = 0;
412 
413     for (int i = 0; i < 10000; i++) {
414         uint64_t t = readtime();
415         nops[i] = (uint16_t) (readtime() - t);
416     }
417 
418     qsort(nops, 10000, sizeof(uint16_t), cmp_nop);
419     for (int i = 2500; i < 7500; i++)
420         nop_sum += nops[i];
421 
422     return nop_sum / 5000.0;
423 }
424 
avg_cycles_per_call(const CheckasmFuncVersion * const v)425 static double avg_cycles_per_call(const CheckasmFuncVersion *const v) {
426     if (v->iterations) {
427         const double cycles = (double)v->cycles / v->iterations - state.nop_time;
428         if (cycles > 0.0)
429             return cycles / 4.0; /* 4 calls per iteration */
430     }
431     return 0.0;
432 }
433 
434 /* Print benchmark results */
print_benchs(const CheckasmFunc * const f)435 static void print_benchs(const CheckasmFunc *const f) {
436     if (f) {
437         print_benchs(f->child[0]);
438 
439         /* Only print functions with at least one assembly version */
440         const CheckasmFuncVersion *v = &f->versions;
441         if (v->iterations) {
442             const double baseline = avg_cycles_per_call(v);
443             do {
444                 const int pad_length = 10 + state.max_function_name_length -
445                     printf("%s_%s:", f->name, cpu_suffix(v->cpu));
446                 const double cycles = avg_cycles_per_call(v);
447                 const double ratio = cycles ? baseline / cycles : 0.0;
448                 printf("%*.1f (%5.2fx)\n", imax(pad_length, 0), cycles, ratio);
449             } while ((v = v->next));
450         }
451 
452         print_benchs(f->child[1]);
453     }
454 }
455 #endif
456 
print_functions(const CheckasmFunc * const f)457 static void print_functions(const CheckasmFunc *const f) {
458     if (f) {
459         print_functions(f->child[0]);
460         const CheckasmFuncVersion *v = &f->versions;
461         printf("%s (%s", f->name, cpu_suffix(v->cpu));
462         while ((v = v->next))
463             printf(", %s", cpu_suffix(v->cpu));
464         printf(")\n");
465         print_functions(f->child[1]);
466     }
467 }
468 
#define is_digit(x) ((x) >= '0' && (x) <= '9')

/* ASCIIbetical sort except preserving natural order for numbers.
 * Returns a negative, zero or positive value if a sorts before, equal to or
 * after b. When the names first differ inside a run of digits, the name
 * with the longer run of digits sorts last (so "w8" sorts before "w16"). */
static int cmp_func_names(const char *a, const char *b) {
    const char *const start = a;
    int ascii_diff;

    /* Skip the common prefix, remembering the first differing byte pair */
    for (;;) {
        ascii_diff = *(const unsigned char*)a - *(const unsigned char*)b;
        if (ascii_diff || !*a)
            break;
        a++;
        b++;
    }

    /* Skip the digits both names still have at this position */
    while (is_digit(*a) && is_digit(*b)) {
        a++;
        b++;
    }

    /* If we were inside a number, the name with digits left is larger */
    if (a > start && is_digit(a[-1])) {
        const int digit_diff = is_digit(*a) - is_digit(*b);
        if (digit_diff)
            return digit_diff;
    }

    return ascii_diff;
}
488 
489 /* Perform a tree rotation in the specified direction and return the new root */
rotate_tree(CheckasmFunc * const f,const int dir)490 static CheckasmFunc *rotate_tree(CheckasmFunc *const f, const int dir) {
491     CheckasmFunc *const r = f->child[dir^1];
492     f->child[dir^1] = r->child[dir];
493     r->child[dir] = f;
494     r->color = f->color;
495     f->color = 0;
496     return r;
497 }
498 
499 #define is_red(f) ((f) && !(f)->color)
500 
501 /* Balance a left-leaning red-black tree at the specified node */
balance_tree(CheckasmFunc ** const root)502 static void balance_tree(CheckasmFunc **const root) {
503     CheckasmFunc *const f = *root;
504 
505     if (is_red(f->child[0]) && is_red(f->child[1])) {
506         f->color ^= 1;
507         f->child[0]->color = f->child[1]->color = 1;
508     }
509     else if (!is_red(f->child[0]) && is_red(f->child[1]))
510         *root = rotate_tree(f, 0); /* Rotate left */
511     else if (is_red(f->child[0]) && is_red(f->child[0]->child[0]))
512         *root = rotate_tree(f, 1); /* Rotate right */
513 }
514 
515 /* Get a node with the specified name, creating it if it doesn't exist */
get_func(CheckasmFunc ** const root,const char * const name)516 static CheckasmFunc *get_func(CheckasmFunc **const root, const char *const name) {
517     CheckasmFunc *f = *root;
518 
519     if (f) {
520         /* Search the tree for a matching node */
521         const int cmp = cmp_func_names(name, f->name);
522         if (cmp) {
523             f = get_func(&f->child[cmp > 0], name);
524 
525             /* Rebalance the tree on the way up if a new node was inserted */
526             if (!f->versions.func)
527                 balance_tree(root);
528         }
529     } else {
530         /* Allocate and insert a new node into the tree */
531         const size_t name_length = strlen(name) + 1;
532         f = *root = checkasm_malloc(offsetof(CheckasmFunc, name) + name_length);
533         memcpy(f->name, name, name_length);
534     }
535 
536     return f;
537 }
538 
/* Saved execution context for crash recovery.
 * NOTE(review): presumably the buffer behind checkasm_save_context() and
 * checkasm_load_context(); the type and both macros/functions are declared
 * outside this file -- confirm in checkasm.h. */
checkasm_context checkasm_context_buf;
540 
541 /* Crash handling: attempt to catch crashes and handle them
542  * gracefully instead of just aborting abruptly. */
543 #ifdef _WIN32
544 #if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
signal_handler(EXCEPTION_POINTERS * const e)545 static LONG NTAPI signal_handler(EXCEPTION_POINTERS *const e) {
546     if (state.sig == SIG_ATOMIC_MAX) {
547         int s;
548         switch (e->ExceptionRecord->ExceptionCode) {
549         case EXCEPTION_FLT_DIVIDE_BY_ZERO:
550         case EXCEPTION_INT_DIVIDE_BY_ZERO:
551             s = SIGFPE;
552             break;
553         case EXCEPTION_ILLEGAL_INSTRUCTION:
554         case EXCEPTION_PRIV_INSTRUCTION:
555             s = SIGILL;
556             break;
557         case EXCEPTION_ACCESS_VIOLATION:
558         case EXCEPTION_ARRAY_BOUNDS_EXCEEDED:
559         case EXCEPTION_DATATYPE_MISALIGNMENT:
560         case EXCEPTION_STACK_OVERFLOW:
561             s = SIGSEGV;
562             break;
563         case EXCEPTION_IN_PAGE_ERROR:
564             s = SIGBUS;
565             break;
566         default:
567             return EXCEPTION_CONTINUE_SEARCH;
568         }
569         state.sig = s;
570         checkasm_load_context();
571     }
572     return EXCEPTION_CONTINUE_SEARCH;
573 }
574 #endif
575 #else
576 static void signal_handler(int s);
577 
578 static const struct sigaction signal_handler_act = {
579     .sa_handler = signal_handler,
580     .sa_flags = SA_RESETHAND,
581 };
582 
signal_handler(const int s)583 static void signal_handler(const int s) {
584     if (state.sig == SIG_ATOMIC_MAX) {
585         state.sig = s;
586         sigaction(s, &signal_handler_act, NULL);
587         checkasm_load_context();
588     }
589 }
590 #endif
591 
/* Compares a string with a wildcard pattern.
 * '*' in the pattern matches any sequence of characters, including none.
 * Returns 0 if str matches and a non-zero value otherwise. */
static int wildstrcmp(const char *str, const char *pattern) {
    const char *wild = strchr(pattern, '*');
    if (!wild)
        return strcmp(str, pattern);

    /* Everything in front of the first wildcard has to match literally */
    const size_t prefix_length = wild - pattern;
    if (strncmp(str, pattern, prefix_length))
        return 1;

    /* Consecutive wildcards are equivalent to a single one */
    do {
        wild++;
    } while (*wild == '*');

    /* A trailing wildcard matches whatever is left of the string */
    if (!*wild)
        return 0;

    /* Try to match the rest of the pattern at every remaining position */
    for (str += prefix_length; *str; str++)
        if (!wildstrcmp(str, wild))
            break;

    return !*str;
}
606 
607 /* Perform tests and benchmarks for the specified
608  * cpu flag if supported by the host */
check_cpu_flag(const char * const name,unsigned flag)609 static void check_cpu_flag(const char *const name, unsigned flag) {
610     const unsigned old_cpu_flag = state.cpu_flag;
611 
612     flag |= old_cpu_flag;
613     dav1d_set_cpu_flags_mask(flag);
614     state.cpu_flag = dav1d_get_cpu_flags();
615 
616     if (!flag || state.cpu_flag != old_cpu_flag) {
617         state.cpu_flag_name = name;
618         state.suffix_length = (int)strlen(cpu_suffix(flag)) + 1;
619         for (int i = 0; tests[i].func; i++) {
620             if (state.test_pattern && wildstrcmp(tests[i].name, state.test_pattern))
621                 continue;
622             xor128_srand(state.seed);
623             state.current_test_name = tests[i].name;
624             tests[i].func();
625         }
626     }
627 }
628 
629 /* Print the name of the current CPU flag, but only do it once */
print_cpu_name(void)630 static void print_cpu_name(void) {
631     if (state.cpu_flag_name) {
632         color_fprintf(stderr, COLOR_YELLOW, "%s:\n", state.cpu_flag_name);
633         state.cpu_flag_name = NULL;
634     }
635 }
636 
get_seed(void)637 static unsigned get_seed(void) {
638 #ifdef _WIN32
639     LARGE_INTEGER i;
640     QueryPerformanceCounter(&i);
641     return i.LowPart;
642 #elif defined(__APPLE__)
643     return (unsigned) mach_absolute_time();
644 #else
645     struct timespec ts;
646     clock_gettime(CLOCK_MONOTONIC, &ts);
647     return (unsigned) (1000000000ULL * ts.tv_sec + ts.tv_nsec);
648 #endif
649 }
650 
/* Parse str as an unsigned long in the given base and store it in *dst.
 * Returns 0 if the whole string was consumed without error, 1 if strtoul()
 * set errno, no characters were converted, or trailing characters remain.
 * *dst is written in either case. */
static int checkasm_strtoul(unsigned long *const dst, const char *const str, const int base) {
    char *end;

    errno = 0;
    *dst = strtoul(str, &end, base);

    if (errno)
        return 1;
    if (end == str)
        return 1; /* no digits at all */
    return *end != '\0'; /* trailing garbage */
}
657 
main(int argc,char * argv[])658 int main(int argc, char *argv[]) {
659     state.seed = get_seed();
660 
661     while (argc > 1) {
662         if (!strncmp(argv[1], "--help", 6) || !strcmp(argv[1], "-h")) {
663             fprintf(stderr,
664                     "checkasm [options] <random seed>\n"
665                     "    <random seed>              Numeric value to seed the rng\n"
666                     "Options:\n"
667                     "    --affinity=<cpu>           Run the process on CPU <cpu>\n"
668                     "    --test=<pattern> -t        Test only <pattern>\n"
669                     "    --function=<pattern> -f    Test only the functions matching <pattern>\n"
670                     "    --bench -b                 Benchmark the tested functions\n"
671                     "    --list-cpuflags            List available cpu flags\n"
672                     "    --list-functions           List available functions\n"
673                     "    --list-tests               List available tests\n"
674                     "    --verbose -v               Print verbose output\n");
675             return 0;
676         } else if (!strcmp(argv[1], "--bench") || !strcmp(argv[1], "-b")) {
677 #ifndef readtime
678             fprintf(stderr,
679                     "checkasm: --bench is not supported on your system\n");
680             return 1;
681 #endif
682             state.run_mode = RUN_BENCHMARK;
683         } else if (!strncmp(argv[1], "--test=", 7)) {
684             state.test_pattern = argv[1] + 7;
685         } else if (!strcmp(argv[1], "-t")) {
686             state.test_pattern = argc > 1 ? argv[2] : "";
687             argc--;
688             argv++;
689         } else if (!strncmp(argv[1], "--function=", 11)) {
690             state.function_pattern = argv[1] + 11;
691         } else if (!strcmp(argv[1], "-f")) {
692             state.function_pattern = argc > 1 ? argv[2] : "";
693             argc--;
694             argv++;
695         } else if (!strcmp(argv[1], "--list-cpuflags")) {
696             state.run_mode = RUN_CPUFLAG_LISTING;
697             break;
698         } else if (!strcmp(argv[1], "--list-functions")) {
699             state.run_mode = RUN_FUNCTION_LISTING;
700         } else if (!strcmp(argv[1], "--list-tests")) {
701             for (int i = 0; tests[i].name; i++)
702                 printf("%s\n", tests[i].name);
703             return 0;
704         } else if (!strcmp(argv[1], "--verbose") || !strcmp(argv[1], "-v")) {
705             state.verbose = 1;
706         } else if (!strncmp(argv[1], "--affinity=", 11)) {
707             const char *const s = argv[1] + 11;
708             unsigned long affinity;
709             if (checkasm_strtoul(&affinity, s, 16)) {
710                 fprintf(stderr, "checkasm: invalid cpu affinity (%s)\n", s);
711                 return 1;
712             }
713 #ifdef _WIN32
714             int affinity_err;
715             HANDLE process = GetCurrentProcess();
716 #if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
717             BOOL (WINAPI *spdcs)(HANDLE, const ULONG*, ULONG) =
718                 (void*)GetProcAddress(GetModuleHandleW(L"kernel32.dll"), "SetProcessDefaultCpuSets");
719             if (spdcs)
720                 affinity_err = !spdcs(process, (ULONG[]){ affinity + 256 }, 1);
721             else
722 #endif
723             {
724                 if (affinity < sizeof(DWORD_PTR) * 8)
725                     affinity_err = !SetProcessAffinityMask(process, (DWORD_PTR)1 << affinity);
726                 else
727                     affinity_err = 1;
728             }
729             if (affinity_err) {
730                 fprintf(stderr, "checkasm: invalid cpu affinity (%lu)\n", affinity);
731                 return 1;
732             } else {
733                 fprintf(stderr, "checkasm: running on cpu %lu\n", affinity);
734             }
735 #elif defined(HAVE_PTHREAD_SETAFFINITY_NP) && defined(CPU_SET)
736             cpu_set_t set;
737             CPU_ZERO(&set);
738             CPU_SET(affinity, &set);
739             if (pthread_setaffinity_np(pthread_self(), sizeof(set), &set)) {
740                 fprintf(stderr, "checkasm: invalid cpu affinity (%lu)\n", affinity);
741                 return 1;
742             } else {
743                 fprintf(stderr, "checkasm: running on cpu %lu\n", affinity);
744             }
745 #else
746             (void)affinity;
747             fprintf(stderr,
748                     "checkasm: --affinity is not supported on your system\n");
749             return 1;
750 #endif
751         } else {
752             unsigned long seed;
753             if (checkasm_strtoul(&seed, argv[1], 10)) {
754                 fprintf(stderr, "checkasm: unknown option (%s)\n", argv[1]);
755                 return 1;
756             }
757             state.seed = (unsigned)seed;
758         }
759 
760         argc--;
761         argv++;
762     }
763 
764 #if TRIM_DSP_FUNCTIONS
765     fprintf(stderr, "checkasm: reference functions unavailable, reconfigure using '-Dtrim_dsp=false'\n");
766     return 0;
767 #endif
768 
769     dav1d_init_cpu();
770 
771 #ifdef _WIN32
772 #if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
773     AddVectoredExceptionHandler(0, signal_handler);
774 
775     HANDLE con = GetStdHandle(state.run_mode >= RUN_CPUFLAG_LISTING ?
776                               STD_OUTPUT_HANDLE : STD_ERROR_HANDLE);
777     DWORD con_mode = 0;
778     use_printf_color = con && con != INVALID_HANDLE_VALUE &&
779                        GetConsoleMode(con, &con_mode) &&
780                        SetConsoleMode(con, con_mode | ENABLE_VIRTUAL_TERMINAL_PROCESSING);
781 #endif
782 #else
783     sigaction(SIGBUS,  &signal_handler_act, NULL);
784     sigaction(SIGFPE,  &signal_handler_act, NULL);
785     sigaction(SIGILL,  &signal_handler_act, NULL);
786     sigaction(SIGSEGV, &signal_handler_act, NULL);
787 
788     if (isatty(state.run_mode >= RUN_CPUFLAG_LISTING ? 1 : 2)) {
789         const char *const term = getenv("TERM");
790         use_printf_color = term && strcmp(term, "dumb");
791     }
792 #endif
793 
794 #ifdef readtime
795     if (state.run_mode == RUN_BENCHMARK) {
796 #if CONFIG_MACOS_KPERF
797         if (kperf_init())
798             return 1;
799 #endif
800         if (!checkasm_save_context()) {
801             checkasm_set_signal_handler_state(1);
802             readtime();
803             checkasm_set_signal_handler_state(0);
804         } else {
805             fprintf(stderr, "checkasm: unable to access cycle counter\n");
806             return 1;
807         }
808     }
809 #endif
810 
811     int ret = 0;
812 
813     if (state.run_mode != RUN_FUNCTION_LISTING) {
814         const unsigned cpu_flags = dav1d_get_cpu_flags();
815         if (state.run_mode == RUN_CPUFLAG_LISTING) {
816             const int last_i = (int)(sizeof(cpus) / sizeof(*cpus)) - 2;
817             for (int i = 0; i <= last_i ; i++) {
818                 if (cpus[i].flag & cpu_flags)
819                     color_fprintf(stdout, COLOR_GREEN, "%s", cpus[i].suffix);
820                 else
821                     color_fprintf(stdout, COLOR_RED, "~%s", cpus[i].suffix);
822                 printf(i == last_i ? "\n" : ", ");
823             }
824             return 0;
825         }
826 #if ARCH_X86_64
827         void checkasm_warmup_avx2(void);
828         void checkasm_warmup_avx512(void);
829         if (cpu_flags & DAV1D_X86_CPU_FLAG_AVX512ICL)
830             state.simd_warmup = checkasm_warmup_avx512;
831         else if (cpu_flags & DAV1D_X86_CPU_FLAG_AVX2)
832             state.simd_warmup = checkasm_warmup_avx2;
833         checkasm_simd_warmup();
834 #endif
835 #if ARCH_X86
836         unsigned checkasm_init_x86(char *name);
837         char name[48];
838         const unsigned cpuid = checkasm_init_x86(name);
839         for (size_t len = strlen(name); len && name[len-1] == ' '; len--)
840             name[len-1] = '\0'; /* trim trailing whitespace */
841         fprintf(stderr, "checkasm: %s (%08X) using random seed %u\n", name, cpuid, state.seed);
842 #elif ARCH_RISCV
843         char buf[32] = "";
844         if (cpu_flags & DAV1D_RISCV_CPU_FLAG_V) {
845             const int vlen = 8*checkasm_get_vlenb();
846             snprintf(buf, sizeof(buf), "VLEN=%i bits, ", vlen);
847         }
848         fprintf(stderr, "checkasm: %susing random seed %u\n", buf, state.seed);
849 #elif ARCH_AARCH64 && HAVE_SVE
850         char buf[48] = "";
851         if (cpu_flags & DAV1D_ARM_CPU_FLAG_SVE)
852             snprintf(buf, sizeof(buf), "SVE %d bits, ", checkasm_sve_length());
853         fprintf(stderr, "checkasm: %susing random seed %u\n", buf, state.seed);
854 #else
855         fprintf(stderr, "checkasm: using random seed %u\n", state.seed);
856 #endif
857     }
858 
859     check_cpu_flag(NULL, 0);
860     for (int i = 0; cpus[i].flag; i++)
861         check_cpu_flag(cpus[i].name, cpus[i].flag);
862 
863     if (state.run_mode == RUN_FUNCTION_LISTING) {
864         print_functions(state.funcs);
865     } else if (state.num_failed) {
866         fprintf(stderr, "checkasm: %d of %d tests failed\n",
867                 state.num_failed, state.num_checked);
868         ret = 1;
869     } else {
870         if (state.num_checked)
871             fprintf(stderr, "checkasm: all %d tests passed\n", state.num_checked);
872         else
873             fprintf(stderr, "checkasm: no tests to perform\n");
874 #ifdef readtime
875         if (state.run_mode == RUN_BENCHMARK && state.max_function_name_length) {
876             state.nop_time = measure_nop_time();
877             if (state.verbose)
878                 printf("nop:%*.1f\n", state.max_function_name_length + 6, state.nop_time);
879             print_benchs(state.funcs);
880         }
881 #endif
882     }
883 
884     destroy_func_tree(state.funcs);
885     return ret;
886 }
887 
888 /* Decide whether or not the specified function needs to be tested and
889  * allocate/initialize data structures if needed. Returns a pointer to a
890  * reference function if the function should be tested, otherwise NULL */
checkasm_check_func(void * const func,const char * const name,...)891 void *checkasm_check_func(void *const func, const char *const name, ...) {
892     char name_buf[256];
893     va_list arg;
894 
895     va_start(arg, name);
896     int name_length = vsnprintf(name_buf, sizeof(name_buf), name, arg);
897     va_end(arg);
898 
899     if (!func || name_length <= 0 || (size_t)name_length >= sizeof(name_buf) ||
900         (state.function_pattern && wildstrcmp(name_buf, state.function_pattern)))
901     {
902         return NULL;
903     }
904 
905     state.current_func = get_func(&state.funcs, name_buf);
906 
907     state.funcs->color = 1;
908     CheckasmFuncVersion *v = &state.current_func->versions;
909     void *ref = func;
910 
911     if (v->func) {
912         CheckasmFuncVersion *prev;
913         do {
914             /* Only test functions that haven't already been tested */
915             if (v->func == func)
916                 return NULL;
917 
918             if (v->ok)
919                 ref = v->func;
920 
921             prev = v;
922         } while ((v = v->next));
923 
924         v = prev->next = checkasm_malloc(sizeof(CheckasmFuncVersion));
925     }
926 
927     name_length += state.suffix_length;
928     if (name_length > state.max_function_name_length)
929         state.max_function_name_length = name_length;
930 
931     v->func = func;
932     v->ok = 1;
933     v->cpu = state.cpu_flag;
934     state.current_func_ver = v;
935     if (state.run_mode == RUN_FUNCTION_LISTING) /* Save function names without running tests */
936         return NULL;
937 
938     xor128_srand(state.seed);
939 
940     if (state.cpu_flag)
941         state.num_checked++;
942 
943     return ref;
944 }
945 
946 /* Decide whether or not the current function needs to be benchmarked */
checkasm_bench_func(void)947 int checkasm_bench_func(void) {
948     return !state.num_failed && state.run_mode == RUN_BENCHMARK;
949 }
950 
951 /* Indicate that the current test has failed, return whether verbose printing
952  * is requested. */
checkasm_fail_func(const char * const msg,...)953 int checkasm_fail_func(const char *const msg, ...) {
954     if (state.current_func_ver && state.current_func_ver->cpu &&
955         state.current_func_ver->ok)
956     {
957         va_list arg;
958 
959         print_cpu_name();
960         fprintf(stderr, "   %s_%s (", state.current_func->name,
961                 cpu_suffix(state.current_func_ver->cpu));
962         va_start(arg, msg);
963         vfprintf(stderr, msg, arg);
964         va_end(arg);
965         fprintf(stderr, ")\n");
966 
967         state.current_func_ver->ok = 0;
968         state.num_failed++;
969     }
970     return state.verbose;
971 }
972 
973 /* Update benchmark results of the current function */
checkasm_update_bench(const int iterations,const uint64_t cycles)974 void checkasm_update_bench(const int iterations, const uint64_t cycles) {
975     state.current_func_ver->iterations += iterations;
976     state.current_func_ver->cycles += cycles;
977 }
978 
979 /* Print the outcome of all tests performed since
980  * the last time this function was called */
checkasm_report(const char * const name,...)981 void checkasm_report(const char *const name, ...) {
982     static int prev_checked, prev_failed;
983     static size_t max_length;
984 
985     if (state.num_checked > prev_checked) {
986         int pad_length = (int) max_length + 4;
987         va_list arg;
988 
989         print_cpu_name();
990         pad_length -= fprintf(stderr, " - %s.", state.current_test_name);
991         va_start(arg, name);
992         pad_length -= vfprintf(stderr, name, arg);
993         va_end(arg);
994         fprintf(stderr, "%*c", imax(pad_length, 0) + 2, '[');
995 
996         if (state.num_failed == prev_failed)
997             color_fprintf(stderr, COLOR_GREEN, "OK");
998         else
999             color_fprintf(stderr, COLOR_RED, "FAILED");
1000         fprintf(stderr, "]\n");
1001 
1002         prev_checked = state.num_checked;
1003         prev_failed  = state.num_failed;
1004     } else if (!state.cpu_flag) {
1005         /* Calculate the amount of padding required
1006          * to make the output vertically aligned */
1007         size_t length = strlen(state.current_test_name);
1008         va_list arg;
1009 
1010         va_start(arg, name);
1011         length += vsnprintf(NULL, 0, name, arg);
1012         va_end(arg);
1013 
1014         if (length > max_length)
1015             max_length = length;
1016     }
1017 }
1018 
checkasm_set_signal_handler_state(const int enabled)1019 void checkasm_set_signal_handler_state(const int enabled) {
1020     state.sig = enabled ? SIG_ATOMIC_MAX : 0;
1021 }
1022 
checkasm_handle_signal(void)1023 void checkasm_handle_signal(void) {
1024     const int s = state.sig;
1025     checkasm_fail_func(s == SIGFPE ? "fatal arithmetic error" :
1026                        s == SIGILL ? "illegal instruction" :
1027                        s == SIGBUS ? "bus error" :
1028                                      "segmentation fault");
1029 }
1030 
/* Shared failure bookkeeping for the buffer comparison functions.
 *
 * If *err is already set, nothing is done and 0 is returned. Otherwise the
 * current function is failed with "file:line" as message. When
 * checkasm_fail_func() returns zero, 1 is returned and *err is left
 * untouched. When it returns non-zero, *err is set, a "name (WxH):" header
 * is printed to stderr and 0 is returned. */
static int check_err(const char *const file, const int line,
                     const char *const name, const int w, const int h,
                     int *const err)
{
    /* A header has already been printed for this comparison */
    if (*err)
        return 0;

    if (checkasm_fail_func("%s:%d", file, line)) {
        *err = 1;
        fprintf(stderr, "%s (%dx%d):\n", name, w, h);
        return 0;
    }

    return 1;
}
1043 
/* Generates checkasm_check_<type>(), which compares two 2D buffers of the
 * given element type and reports differences on stderr.
 *
 *   type - element type of both buffers
 *   fmt  - printf conversion used to dump a single element
 *
 * Parameters of the generated function:
 *   file, line       - call site, forwarded to check_err() for the message
 *   buf1, buf2       - pointers to the first element of the w x h area
 *   stride1, stride2 - row strides; they are divided by the element size
 *                      before being used as element offsets, so callers pass
 *                      them in bytes
 *   w, h             - size of the area that must match
 *   name             - label printed in the failure header
 *   align_w, align_h - w and h are rounded up to a multiple of these to find
 *                      where the right and bottom padding start; the rounding
 *                      uses a bit mask and is therefore only correct for
 *                      powers of two
 *   padding          - number of rows above/below and elements left/right of
 *                      the area that must match as well (overwrite detection)
 *
 * On a mismatch inside the area, both buffers are dumped row by row followed
 * by a map in which 'x' marks differing elements and '.' equal ones. A
 * mismatch in the padding prints " overwrite above/below/left/right".
 *
 * Returns 0 if everything matched and 1 otherwise. If check_err() returns
 * non-zero the function returns 1 right away without printing details.
 *
 * The strides are divided using signed arithmetic. Dividing a ptrdiff_t by
 * the unsigned result of sizeof would convert a negative stride to a huge
 * unsigned value first and yield a bogus element stride for every element
 * type wider than one byte. */
#define DEF_CHECKASM_CHECK_FUNC(type, fmt) \
int checkasm_check_##type(const char *const file, const int line, \
                          const type *buf1, ptrdiff_t stride1, \
                          const type *buf2, ptrdiff_t stride2, \
                          const int w, int h, const char *const name, \
                          const int align_w, const int align_h, \
                          const int padding) \
{ \
    int aligned_w = (w + align_w - 1) & ~(align_w - 1); \
    int aligned_h = (h + align_h - 1) & ~(align_h - 1); \
    int err = 0; \
    /* Convert to element units; keep the division signed */ \
    stride1 /= (ptrdiff_t)sizeof(*buf1); \
    stride2 /= (ptrdiff_t)sizeof(*buf2); \
    int y = 0; \
    /* Look for the first row of the area that differs */ \
    for (y = 0; y < h; y++) \
        if (memcmp(&buf1[y*stride1], &buf2[y*stride2], w*sizeof(*buf1))) \
            break; \
    if (y != h) { \
        if (check_err(file, line, name, w, h, &err)) \
            return 1; \
        /* Dump buf1, buf2 and the difference map side by side */ \
        for (y = 0; y < h; y++) { \
            for (int x = 0; x < w; x++) \
                fprintf(stderr, " " fmt, buf1[x]); \
            fprintf(stderr, "    "); \
            for (int x = 0; x < w; x++) \
                fprintf(stderr, " " fmt, buf2[x]); \
            fprintf(stderr, "    "); \
            for (int x = 0; x < w; x++) \
                fprintf(stderr, "%c", buf1[x] != buf2[x] ? 'x' : '.'); \
            buf1 += stride1; \
            buf2 += stride2; \
            fprintf(stderr, "\n"); \
        } \
        /* Rewind to the first row for the padding checks below */ \
        buf1 -= h*stride1; \
        buf2 -= h*stride2; \
    } \
    /* Padding rows above the area */ \
    for (y = -padding; y < 0; y++) \
        if (memcmp(&buf1[y*stride1 - padding], &buf2[y*stride2 - padding], \
                   (w + 2*padding)*sizeof(*buf1))) { \
            if (check_err(file, line, name, w, h, &err)) \
                return 1; \
            fprintf(stderr, " overwrite above\n"); \
            break; \
        } \
    /* Padding rows below the (aligned) area */ \
    for (y = aligned_h; y < aligned_h + padding; y++) \
        if (memcmp(&buf1[y*stride1 - padding], &buf2[y*stride2 - padding], \
                   (w + 2*padding)*sizeof(*buf1))) { \
            if (check_err(file, line, name, w, h, &err)) \
                return 1; \
            fprintf(stderr, " overwrite below\n"); \
            break; \
        } \
    /* Padding elements to the left of every row */ \
    for (y = 0; y < h; y++) \
        if (memcmp(&buf1[y*stride1 - padding], &buf2[y*stride2 - padding], \
                   padding*sizeof(*buf1))) { \
            if (check_err(file, line, name, w, h, &err)) \
                return 1; \
            fprintf(stderr, " overwrite left\n"); \
            break; \
        } \
    /* Padding elements to the right of every (aligned) row */ \
    for (y = 0; y < h; y++) \
        if (memcmp(&buf1[y*stride1 + aligned_w], &buf2[y*stride2 + aligned_w], \
                   padding*sizeof(*buf1))) { \
            if (check_err(file, line, name, w, h, &err)) \
                return 1; \
            fprintf(stderr, " overwrite right\n"); \
            break; \
        } \
    return err; \
}

DEF_CHECKASM_CHECK_FUNC(int8_t,   "%4d")
DEF_CHECKASM_CHECK_FUNC(int16_t,  "%6d")
DEF_CHECKASM_CHECK_FUNC(int32_t,  "%9d")
DEF_CHECKASM_CHECK_FUNC(uint8_t,  "%02x")
DEF_CHECKASM_CHECK_FUNC(uint16_t, "%04x")
DEF_CHECKASM_CHECK_FUNC(uint32_t, "%08x")
1121 
#if ARCH_X86_64
/* Invoke the warmup routine stored in state.simd_warmup, if one is set */
void checkasm_simd_warmup(void)
{
    if (!state.simd_warmup)
        return;
    state.simd_warmup();
}
#endif
1129