• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**************************************************************************
2  *
3  * Copyright 2008 Dennis Smit
4  * All Rights Reserved.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * on the rights to use, copy, modify, merge, publish, distribute, sub
10  * license, and/or sell copies of the Software, and to permit persons to whom
11  * the Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice (including the next
14  * paragraph) shall be included in all copies or substantial portions of the
15  * Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
20  * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
21  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
22  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
23  * USE OR OTHER DEALINGS IN THE SOFTWARE.
24  *
25  **************************************************************************/
26 
27 /**
28  * @file
29  * CPU feature detection.
30  *
31  * @author Dennis Smit
32  * @author Based on the work of Eric Anholt <anholt@FreeBSD.org>
33  */
34 
35 #include "pipe/p_config.h"
36 #include "pipe/p_compiler.h"
37 
38 #include "util/u_debug.h"
39 #include "u_cpu_detect.h"
40 #include "u_math.h"
41 #include "c11/threads.h"
42 
43 #include <stdio.h>
44 #include <inttypes.h>
45 
46 #if defined(PIPE_ARCH_PPC)
47 #if defined(PIPE_OS_APPLE)
48 #include <sys/sysctl.h>
49 #else
50 #include <signal.h>
51 #include <setjmp.h>
52 #endif
53 #endif
54 
55 #if defined(PIPE_OS_BSD)
56 #include <sys/param.h>
57 #include <sys/sysctl.h>
58 #include <machine/cpu.h>
59 #endif
60 
61 #if defined(PIPE_OS_FREEBSD)
62 #if __has_include(<sys/auxv.h>)
63 #include <sys/auxv.h>
64 #define HAVE_ELF_AUX_INFO
65 #endif
66 #endif
67 
68 #if defined(PIPE_OS_LINUX)
69 #include <signal.h>
70 #include <fcntl.h>
71 #include <elf.h>
72 #endif
73 
74 #ifdef PIPE_OS_UNIX
75 #include <unistd.h>
76 #endif
77 
78 #if defined(HAS_ANDROID_CPUFEATURES)
79 #include <cpu-features.h>
80 #endif
81 
82 #if defined(PIPE_OS_WINDOWS)
83 #include <windows.h>
84 #if defined(PIPE_CC_MSVC)
85 #include <intrin.h>
86 #endif
87 #endif
88 
89 
90 DEBUG_GET_ONCE_BOOL_OPTION(dump_cpu, "GALLIUM_DUMP_CPU", false)
91 
92 
93 struct util_cpu_caps util_cpu_caps;
94 
95 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
96 static int has_cpuid(void);
97 #endif
98 
99 
#if defined(PIPE_ARCH_PPC) && !defined(PIPE_OS_APPLE) && !defined(PIPE_OS_BSD) && !defined(PIPE_OS_LINUX)
/* State shared with the brute-force AltiVec probe in
 * check_os_altivec_support(): the jump target and a flag telling the
 * handler whether a SIGILL is expected right now.
 */
static jmp_buf  __lv_powerpc_jmpbuf;
static volatile sig_atomic_t __lv_powerpc_canjump = 0;

/**
 * SIGILL handler for the AltiVec/VSX instruction probe.
 *
 * While a probe instruction is being executed, __lv_powerpc_canjump is
 * non-zero and a fault simply long-jumps back to the setjmp site.  Any
 * other SIGILL is re-delivered with the default disposition.
 */
static void
sigill_handler(int sig)
{
   if (__lv_powerpc_canjump == 0) {
      /* Unexpected SIGILL: restore the default action and re-raise it. */
      signal(sig, SIG_DFL);
      raise(sig);
   }

   /* Expected fault from a probe (or raise() returned): jump back. */
   __lv_powerpc_canjump = 0;
   longjmp(__lv_powerpc_jmpbuf, 1);
}
#endif
116 
#if defined(PIPE_ARCH_PPC)
/**
 * Detect AltiVec and VSX support and record it in util_cpu_caps.
 *
 * Compile-time macros (__ALTIVEC__/__VSX__) are trusted first.  Otherwise
 * the OS is queried: sysctl on Apple/NetBSD/OpenBSD, AT_HWCAP on FreeBSD
 * and Linux.  As a last resort a vector instruction is executed under a
 * SIGILL handler (borrowed from libmpeg2).
 */
static void
check_os_altivec_support(void)
{
#if defined(__ALTIVEC__)
   util_cpu_caps.has_altivec = 1;
#endif
#if defined(__VSX__)
   util_cpu_caps.has_vsx = 1;
#endif
#if defined(__ALTIVEC__) && defined(__VSX__)
/* Do nothing */
#elif defined(PIPE_OS_APPLE) || defined(PIPE_OS_NETBSD) || defined(PIPE_OS_OPENBSD)
#ifdef HW_VECTORUNIT
   int sels[2] = {CTL_HW, HW_VECTORUNIT};
#else
   int sels[2] = {CTL_MACHDEP, CPU_ALTIVEC};
#endif
   int has_vu = 0;
   /* BUGFIX: sysctl(3)'s oldlenp parameter is a size_t *, not an int *;
    * passing the address of an int is wrong on LP64 platforms.
    */
   size_t len = sizeof(has_vu);
   int err;

   err = sysctl(sels, 2, &has_vu, &len, NULL, 0);

   if (err == 0) {
      if (has_vu != 0) {
         util_cpu_caps.has_altivec = 1;
      }
   }
#elif defined(PIPE_OS_FREEBSD) /* !PIPE_OS_APPLE && !PIPE_OS_NETBSD && !PIPE_OS_OPENBSD */
   unsigned long hwcap = 0;
#ifdef HAVE_ELF_AUX_INFO
   elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
#else
   size_t len = sizeof(hwcap);
   sysctlbyname("hw.cpu_features", &hwcap, &len, NULL, 0);
#endif
   if (hwcap & PPC_FEATURE_HAS_ALTIVEC)
      util_cpu_caps.has_altivec = 1;
   if (hwcap & PPC_FEATURE_HAS_VSX)
      util_cpu_caps.has_vsx = 1;
#elif defined(PIPE_OS_LINUX) /* !PIPE_OS_FREEBSD */
#if defined(PIPE_ARCH_PPC_64)
    Elf64_auxv_t aux;
#else
    Elf32_auxv_t aux;
#endif
    /* Scan the auxiliary vector for AT_HWCAP instead of executing probe
     * instructions; this avoids installing a signal handler.
     */
    int fd = open("/proc/self/auxv", O_RDONLY | O_CLOEXEC);
    if (fd >= 0) {
       while (read(fd, &aux, sizeof(aux)) == sizeof(aux)) {
          if (aux.a_type == AT_HWCAP) {
             char *env_vsx = getenv("GALLIVM_VSX");
             uint64_t hwcap = aux.a_un.a_val;
             util_cpu_caps.has_altivec = (hwcap >> 28) & 1;
             /* GALLIVM_VSX=0 disables VSX even when the hardware has it. */
             if (!env_vsx || env_vsx[0] != '0') {
                util_cpu_caps.has_vsx  = (hwcap >>  7) & 1;
             }
             break;
          }
       }
       close(fd);
    }
#else /* !PIPE_OS_APPLE && !PIPE_OS_BSD && !PIPE_OS_LINUX */
   /* not on Apple/Darwin or Linux, do it the brute-force way */
   /* this is borrowed from the libmpeg2 library */
   signal(SIGILL, sigill_handler);
   if (setjmp(__lv_powerpc_jmpbuf)) {
      signal(SIGILL, SIG_DFL);
   } else {
      boolean enable_altivec = TRUE;    /* Default: enable  if available, and if not overridden */
      boolean enable_vsx = TRUE;
#ifdef DEBUG
      /* Disabling Altivec code generation is not the same as disabling VSX code generation,
       * which can be done simply by passing -mattr=-vsx to the LLVM compiler; cf.
       * lp_build_create_jit_compiler_for_module().
       * If you want to disable Altivec code generation, the best place to do it is here.
       */
      char *env_control = getenv("GALLIVM_ALTIVEC");    /* 1=enable (default); 0=disable */
      if (env_control && env_control[0] == '0') {
         enable_altivec = FALSE;
      }
#endif
      /* VSX instructions can be explicitly enabled/disabled via GALLIVM_VSX=1 or 0 */
      char *env_vsx = getenv("GALLIVM_VSX");
      if (env_vsx && env_vsx[0] == '0') {
         enable_vsx = FALSE;
      }
      if (enable_altivec) {
         __lv_powerpc_canjump = 1;

         /* Enable the vector unit (VRSAVE) and execute an AltiVec
          * instruction; a SIGILL long-jumps past has_altivec = 1. */
         __asm __volatile
            ("mtspr 256, %0\n\t"
             "vand %%v0, %%v0, %%v0"
             :
             : "r" (-1));

         util_cpu_caps.has_altivec = 1;

         if (enable_vsx) {
            __asm __volatile("xxland %vs0, %vs0, %vs0");
            util_cpu_caps.has_vsx = 1;
         }
         signal(SIGILL, SIG_DFL);
      } else {
         util_cpu_caps.has_altivec = 0;
      }
   }
#endif /* !PIPE_OS_APPLE && !PIPE_OS_LINUX */
}
#endif /* PIPE_ARCH_PPC */
227 
228 
229 #if defined(PIPE_ARCH_X86) || defined (PIPE_ARCH_X86_64)
/**
 * Return non-zero if the CPUID instruction is available.
 *
 * On 32-bit x86, toggle the ID bit (bit 21) of EFLAGS; if the change
 * sticks when the flags are re-read, CPUID is supported.  On x86-64 the
 * instruction is always present.
 */
static int has_cpuid(void)
{
#if defined(PIPE_ARCH_X86_64)
   return 1;
#elif defined(PIPE_ARCH_X86)
#if defined(PIPE_OS_GCC)
   int after, before;

   __asm __volatile
      ("pushf\n"
       "popl %0\n"
       "movl %0, %1\n"
       "xorl $0x200000, %0\n"
       "push %0\n"
       "popf\n"
       "pushf\n"
       "popl %0\n"
       : "=a" (after), "=c" (before)
       :
       : "cc");

   return after != before;
#else
   /* FIXME */
   return 1;
#endif
#else
   return 0;
#endif
}
260 
261 
/**
 * Execute CPUID with leaf @p ax, storing EAX/EBX/ECX/EDX into p[0..3].
 *
 * On 32-bit x86 with PIC, EBX is the GOT register, so it is preserved
 * via xchg around the instruction.  When no implementation is available
 * the outputs are zeroed.
 *
 * @sa cpuid.h included in gcc-4.3 onwards.
 * @sa http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
 */
static inline void
cpuid(uint32_t ax, uint32_t *p)
{
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86)
   __asm __volatile (
     "xchgl %%ebx, %1\n\t"
     "cpuid\n\t"
     "xchgl %%ebx, %1"
     : "=a" (p[0]),
       "=S" (p[1]),
       "=c" (p[2]),
       "=d" (p[3])
     : "0" (ax)
   );
#elif defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86_64)
   __asm __volatile (
     "cpuid\n\t"
     : "=a" (p[0]),
       "=b" (p[1]),
       "=c" (p[2]),
       "=d" (p[3])
     : "0" (ax)
   );
#elif defined(PIPE_CC_MSVC)
   __cpuid(p, ax);
#else
   /* Unknown compiler/arch: report no features. */
   p[0] = p[1] = p[2] = p[3] = 0;
#endif
}
298 
/**
 * Execute CPUID with leaf @p ax and sub-leaf @p cx, storing
 * EAX/EBX/ECX/EDX into p[0..3].
 *
 * Same EBX-preservation trick as cpuid() on 32-bit x86; outputs are
 * zeroed when no implementation is available.
 *
 * @sa cpuid.h included in gcc-4.4 onwards.
 * @sa http://msdn.microsoft.com/en-us/library/hskdteyh%28v=vs.90%29.aspx
 */
static inline void
cpuid_count(uint32_t ax, uint32_t cx, uint32_t *p)
{
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86)
   __asm __volatile (
     "xchgl %%ebx, %1\n\t"
     "cpuid\n\t"
     "xchgl %%ebx, %1"
     : "=a" (p[0]),
       "=S" (p[1]),
       "=c" (p[2]),
       "=d" (p[3])
     : "0" (ax), "2" (cx)
   );
#elif defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86_64)
   __asm __volatile (
     "cpuid\n\t"
     : "=a" (p[0]),
       "=b" (p[1]),
       "=c" (p[2]),
       "=d" (p[3])
     : "0" (ax), "2" (cx)
   );
#elif defined(PIPE_CC_MSVC)
   __cpuidex(p, ax, cx);
#else
   /* Unknown compiler/arch: report no features. */
   p[0] = p[1] = p[2] = p[3] = 0;
#endif
}
335 
336 
/**
 * Read XCR0 (the XFEATURE_ENABLED_MASK register) via XGETBV.
 *
 * Used to verify that the OS saves/restores XMM, YMM and AVX-512 state.
 * Returns 0 when the instruction cannot be emitted, which makes all
 * OS-support checks fail safely.
 */
static inline uint64_t xgetbv(void)
{
#if defined(PIPE_CC_GCC)
   uint32_t lo, hi;

   __asm __volatile (
     ".byte 0x0f, 0x01, 0xd0" /* raw XGETBV opcode; the mnemonic needs gcc >= 4.4 */
     : "=a"(lo),
       "=d"(hi)
     : "c"(0)
   );

   return ((uint64_t)hi << 32) | lo;
#elif defined(PIPE_CC_MSVC) && defined(_MSC_FULL_VER) && defined(_XCR_XFEATURE_ENABLED_MASK)
   return _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
#else
   return 0;
#endif
}
356 
357 
#if defined(PIPE_ARCH_X86)
/**
 * Check whether this SSE2 implementation supports denormals-are-zero
 * (DAZ) mode: bit 6 of the MXCSR_MASK field written by FXSAVE.
 */
PIPE_ALIGN_STACK static inline boolean sse2_has_daz(void)
{
   /* Layout of the 512-byte FXSAVE area; mxcsr_mask is the dword at
    * byte offset 28.  FXSAVE requires 16-byte alignment.
    */
   struct {
      uint32_t pad1[7];
      uint32_t mxcsr_mask;
      uint32_t pad2[128-8];
   } PIPE_ALIGN_VAR(16) fxarea;

   /* Pre-clear: processors may leave the field untouched, and a zero
    * mask reads as "no DAZ". */
   fxarea.mxcsr_mask = 0;
#if defined(PIPE_CC_GCC)
   __asm __volatile ("fxsave %0" : "+m" (fxarea));
#elif defined(PIPE_CC_MSVC) || defined(PIPE_CC_ICL)
   _fxsave(&fxarea);
#else
   fxarea.mxcsr_mask = 0;
#endif
   return !!(fxarea.mxcsr_mask & (1 << 6));
}
#endif
378 
379 #endif /* X86 or X86_64 */
380 
381 #if defined(PIPE_ARCH_ARM)
/**
 * Detect NEON support on 32-bit ARM and record it in util_cpu_caps.
 */
static void
check_os_arm_support(void)
{
   /*
    * On Android, the cpufeatures library is preferred way of checking
    * CPU capabilities. However, it is not available for standalone Mesa
    * builds, i.e. when Android build system (Android.mk-based) is not
    * used. Because of this we cannot use PIPE_OS_ANDROID here, but rather
    * have a separate macro that only gets enabled from respective Android.mk.
    */
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
   /* The compiler already targets NEON, so it must be present. */
   util_cpu_caps.has_neon = 1;
#elif defined(PIPE_OS_FREEBSD) && defined(HAVE_ELF_AUX_INFO)
   unsigned long hwcap = 0;
   elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
   if (hwcap & HWCAP_NEON)
      util_cpu_caps.has_neon = 1;
#elif defined(HAS_ANDROID_CPUFEATURES)
   AndroidCpuFamily cpu_family = android_getCpuFamily();
   uint64_t cpu_features = android_getCpuFeatures();

   if (cpu_family == ANDROID_CPU_FAMILY_ARM) {
      if (cpu_features & ANDROID_CPU_ARM_FEATURE_NEON)
         util_cpu_caps.has_neon = 1;
   }
#elif defined(PIPE_OS_LINUX)
    /* Scan the ELF auxiliary vector for the NEON bit of AT_HWCAP. */
    Elf32_auxv_t aux;
    int fd;

    fd = open("/proc/self/auxv", O_RDONLY | O_CLOEXEC);
    if (fd >= 0) {
       while (read(fd, &aux, sizeof(Elf32_auxv_t)) == sizeof(Elf32_auxv_t)) {
          if (aux.a_type == AT_HWCAP) {
             uint32_t hwcap = aux.a_un.a_val;

             /* Bit 12 of AT_HWCAP is HWCAP_NEON. */
             util_cpu_caps.has_neon = (hwcap >> 12) & 1;
             break;
          }
       }
       close (fd);
    }
#endif /* PIPE_OS_LINUX */
}
425 
426 #elif defined(PIPE_ARCH_AARCH64)
427 static void
check_os_arm_support(void)428 check_os_arm_support(void)
429 {
430     util_cpu_caps.has_neon = true;
431 }
432 #endif /* PIPE_ARCH_ARM || PIPE_ARCH_AARCH64 */
433 
434 static void
get_cpu_topology(void)435 get_cpu_topology(void)
436 {
437    /* Default. This is OK if L3 is not present or there is only one. */
438    util_cpu_caps.cores_per_L3 = util_cpu_caps.nr_cpus;
439    util_cpu_caps.num_L3_caches = 1;
440 
441 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
442    /* AMD Zen */
443    if (util_cpu_caps.family >= CPU_AMD_ZEN1_ZEN2 &&
444        util_cpu_caps.family < CPU_AMD_LAST) {
445       uint32_t regs[4];
446 
447       /* Query the L3 cache count. */
448       cpuid_count(0x8000001D, 3, regs);
449       unsigned cache_level = (regs[0] >> 5) & 0x7;
450       unsigned cores_per_L3 = ((regs[0] >> 14) & 0xfff) + 1;
451 
452       if (cache_level != 3 || cores_per_L3 == util_cpu_caps.nr_cpus)
453          return;
454 
455       uint32_t saved_mask[UTIL_MAX_CPUS / 32] = {0};
456       uint32_t mask[UTIL_MAX_CPUS / 32] = {0};
457       uint32_t allowed_mask[UTIL_MAX_CPUS / 32] = {0};
458       uint32_t apic_id[UTIL_MAX_CPUS];
459       bool saved = false;
460 
461       /* Query APIC IDs from each CPU core.
462        *
463        * An APIC ID is a logical ID of the CPU with respect to the cache
464        * hierarchy, meaning that consecutive APIC IDs are neighbours in
465        * the hierarchy, e.g. sharing the same cache.
466        *
467        * For example, CPU 0 can have APIC ID 0 and CPU 12 can have APIC ID 1,
468        * which means that both CPU 0 and 12 are next to each other.
469        * (e.g. they are 2 threads belonging to 1 SMT2 core)
470        *
471        * We need to find out which CPUs share the same L3 cache and they can
472        * be all over the place.
473        *
474        * Querying the APIC ID can only be done by pinning the current thread
475        * to each core. The original affinity mask is saved.
476        */
477       for (unsigned i = 0; i < util_cpu_caps.nr_cpus && i < UTIL_MAX_CPUS;
478            i++) {
479          uint32_t cpu_bit = 1u << (i % 32);
480 
481          mask[i / 32] = cpu_bit;
482 
483          if (util_set_current_thread_affinity(mask,
484                                               !saved ? saved_mask : NULL,
485                                               UTIL_MAX_CPUS)) {
486             saved = true;
487             allowed_mask[i / 32] |= cpu_bit;
488 
489             /* Query the APIC ID of the current core. */
490             cpuid(0x00000001, regs);
491             apic_id[i] = regs[1] >> 24;
492          }
493          mask[i / 32] = 0;
494       }
495 
496       if (saved) {
497 
498          /* We succeeded in using at least one CPU. */
499          util_cpu_caps.num_L3_caches = util_cpu_caps.nr_cpus / cores_per_L3;
500          util_cpu_caps.cores_per_L3 = cores_per_L3;
501          util_cpu_caps.L3_affinity_mask = calloc(sizeof(util_affinity_mask),
502                                                  util_cpu_caps.num_L3_caches);
503 
504          for (unsigned i = 0; i < util_cpu_caps.nr_cpus && i < UTIL_MAX_CPUS;
505               i++) {
506             uint32_t cpu_bit = 1u << (i % 32);
507 
508             if (allowed_mask[i / 32] & cpu_bit) {
509                /* Each APIC ID bit represents a topology level, so we need
510                 * to round up to the next power of two.
511                 */
512                unsigned L3_index = apic_id[i] /
513                                    util_next_power_of_two(cores_per_L3);
514 
515                util_cpu_caps.L3_affinity_mask[L3_index][i / 32] |= cpu_bit;
516                util_cpu_caps.cpu_to_L3[i] = L3_index;
517             }
518          }
519 
520          if (debug_get_option_dump_cpu()) {
521             fprintf(stderr, "CPU <-> L3 cache mapping:\n");
522             for (unsigned i = 0; i < util_cpu_caps.num_L3_caches; i++) {
523                fprintf(stderr, "  - L3 %u mask = ", i);
524                for (int j = util_cpu_caps.nr_cpus - 1; j >= 0; j -= 32)
525                   fprintf(stderr, "%08x ", util_cpu_caps.L3_affinity_mask[i][j / 32]);
526                fprintf(stderr, "\n");
527             }
528          }
529 
530          /* Restore the original affinity mask. */
531          util_set_current_thread_affinity(saved_mask, NULL, UTIL_MAX_CPUS);
532       } else {
533          if (debug_get_option_dump_cpu())
534             fprintf(stderr, "Cannot set thread affinity for any thread.\n");
535       }
536    }
537 #endif
538 }
539 
540 static void
util_cpu_detect_once(void)541 util_cpu_detect_once(void)
542 {
543    memset(&util_cpu_caps, 0, sizeof util_cpu_caps);
544 
545    /* Count the number of CPUs in system */
546 #if defined(PIPE_OS_WINDOWS)
547    {
548       SYSTEM_INFO system_info;
549       GetSystemInfo(&system_info);
550       util_cpu_caps.nr_cpus = system_info.dwNumberOfProcessors;
551    }
552 #elif defined(PIPE_OS_UNIX) && defined(_SC_NPROCESSORS_ONLN)
553    util_cpu_caps.nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
554    if (util_cpu_caps.nr_cpus == ~0)
555       util_cpu_caps.nr_cpus = 1;
556 #elif defined(PIPE_OS_BSD)
557    {
558       int mib[2], ncpu;
559       int len;
560 
561       mib[0] = CTL_HW;
562       mib[1] = HW_NCPU;
563 
564       len = sizeof (ncpu);
565       sysctl(mib, 2, &ncpu, &len, NULL, 0);
566       util_cpu_caps.nr_cpus = ncpu;
567    }
568 #else
569    util_cpu_caps.nr_cpus = 1;
570 #endif
571 
572    /* Make the fallback cacheline size nonzero so that it can be
573     * safely passed to align().
574     */
575    util_cpu_caps.cacheline = sizeof(void *);
576 
577 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
578    if (has_cpuid()) {
579       uint32_t regs[4];
580       uint32_t regs2[4];
581 
582       util_cpu_caps.cacheline = 32;
583 
584       /* Get max cpuid level */
585       cpuid(0x00000000, regs);
586 
587       if (regs[0] >= 0x00000001) {
588          unsigned int cacheline;
589 
590          cpuid (0x00000001, regs2);
591 
592          util_cpu_caps.x86_cpu_type = (regs2[0] >> 8) & 0xf;
593          /* Add "extended family". */
594          if (util_cpu_caps.x86_cpu_type == 0xf)
595              util_cpu_caps.x86_cpu_type += ((regs2[0] >> 20) & 0xff);
596 
597          switch (util_cpu_caps.x86_cpu_type) {
598          case 0x17:
599             util_cpu_caps.family = CPU_AMD_ZEN1_ZEN2;
600             break;
601          case 0x18:
602             util_cpu_caps.family = CPU_AMD_ZEN_HYGON;
603             break;
604          case 0x19:
605             util_cpu_caps.family = CPU_AMD_ZEN3;
606             break;
607          }
608 
609          /* general feature flags */
610          util_cpu_caps.has_tsc    = (regs2[3] >>  4) & 1; /* 0x0000010 */
611          util_cpu_caps.has_mmx    = (regs2[3] >> 23) & 1; /* 0x0800000 */
612          util_cpu_caps.has_sse    = (regs2[3] >> 25) & 1; /* 0x2000000 */
613          util_cpu_caps.has_sse2   = (regs2[3] >> 26) & 1; /* 0x4000000 */
614          util_cpu_caps.has_sse3   = (regs2[2] >>  0) & 1; /* 0x0000001 */
615          util_cpu_caps.has_ssse3  = (regs2[2] >>  9) & 1; /* 0x0000020 */
616          util_cpu_caps.has_sse4_1 = (regs2[2] >> 19) & 1;
617          util_cpu_caps.has_sse4_2 = (regs2[2] >> 20) & 1;
618          util_cpu_caps.has_popcnt = (regs2[2] >> 23) & 1;
619          util_cpu_caps.has_avx    = ((regs2[2] >> 28) & 1) && // AVX
620                                     ((regs2[2] >> 27) & 1) && // OSXSAVE
621                                     ((xgetbv() & 6) == 6);    // XMM & YMM
622          util_cpu_caps.has_f16c   = ((regs2[2] >> 29) & 1) && util_cpu_caps.has_avx;
623          util_cpu_caps.has_fma    = ((regs2[2] >> 12) & 1) && util_cpu_caps.has_avx;
624          util_cpu_caps.has_mmx2   = util_cpu_caps.has_sse; /* SSE cpus supports mmxext too */
625 #if defined(PIPE_ARCH_X86_64)
626          util_cpu_caps.has_daz = 1;
627 #else
628          util_cpu_caps.has_daz = util_cpu_caps.has_sse3 ||
629             (util_cpu_caps.has_sse2 && sse2_has_daz());
630 #endif
631 
632          cacheline = ((regs2[1] >> 8) & 0xFF) * 8;
633          if (cacheline > 0)
634             util_cpu_caps.cacheline = cacheline;
635       }
636       if (util_cpu_caps.has_avx && regs[0] >= 0x00000007) {
637          uint32_t regs7[4];
638          cpuid_count(0x00000007, 0x00000000, regs7);
639          util_cpu_caps.has_avx2 = (regs7[1] >> 5) & 1;
640       }
641 
642       // check for avx512
643       if (((regs2[2] >> 27) & 1) && // OSXSAVE
644           (xgetbv() & (0x7 << 5)) && // OPMASK: upper-256 enabled by OS
645           ((xgetbv() & 6) == 6)) { // XMM/YMM enabled by OS
646          uint32_t regs3[4];
647          cpuid_count(0x00000007, 0x00000000, regs3);
648          util_cpu_caps.has_avx512f    = (regs3[1] >> 16) & 1;
649          util_cpu_caps.has_avx512dq   = (regs3[1] >> 17) & 1;
650          util_cpu_caps.has_avx512ifma = (regs3[1] >> 21) & 1;
651          util_cpu_caps.has_avx512pf   = (regs3[1] >> 26) & 1;
652          util_cpu_caps.has_avx512er   = (regs3[1] >> 27) & 1;
653          util_cpu_caps.has_avx512cd   = (regs3[1] >> 28) & 1;
654          util_cpu_caps.has_avx512bw   = (regs3[1] >> 30) & 1;
655          util_cpu_caps.has_avx512vl   = (regs3[1] >> 31) & 1;
656          util_cpu_caps.has_avx512vbmi = (regs3[2] >>  1) & 1;
657       }
658 
659       if (regs[1] == 0x756e6547 && regs[2] == 0x6c65746e && regs[3] == 0x49656e69) {
660          /* GenuineIntel */
661          util_cpu_caps.has_intel = 1;
662       }
663 
664       cpuid(0x80000000, regs);
665 
666       if (regs[0] >= 0x80000001) {
667 
668          cpuid(0x80000001, regs2);
669 
670          util_cpu_caps.has_mmx  |= (regs2[3] >> 23) & 1;
671          util_cpu_caps.has_mmx2 |= (regs2[3] >> 22) & 1;
672          util_cpu_caps.has_3dnow = (regs2[3] >> 31) & 1;
673          util_cpu_caps.has_3dnow_ext = (regs2[3] >> 30) & 1;
674 
675          util_cpu_caps.has_xop = util_cpu_caps.has_avx &&
676                                  ((regs2[2] >> 11) & 1);
677       }
678 
679       if (regs[0] >= 0x80000006) {
680          /* should we really do this if the clflush size above worked? */
681          unsigned int cacheline;
682          cpuid(0x80000006, regs2);
683          cacheline = regs2[2] & 0xFF;
684          if (cacheline > 0)
685             util_cpu_caps.cacheline = cacheline;
686       }
687 
688       if (!util_cpu_caps.has_sse) {
689          util_cpu_caps.has_sse2 = 0;
690          util_cpu_caps.has_sse3 = 0;
691          util_cpu_caps.has_ssse3 = 0;
692          util_cpu_caps.has_sse4_1 = 0;
693       }
694    }
695 #endif /* PIPE_ARCH_X86 || PIPE_ARCH_X86_64 */
696 
697 #if defined(PIPE_ARCH_ARM) || defined(PIPE_ARCH_AARCH64)
698    check_os_arm_support();
699 #endif
700 
701 #if defined(PIPE_ARCH_PPC)
702    check_os_altivec_support();
703 #endif /* PIPE_ARCH_PPC */
704 
705    get_cpu_topology();
706 
707    if (debug_get_option_dump_cpu()) {
708       debug_printf("util_cpu_caps.nr_cpus = %u\n", util_cpu_caps.nr_cpus);
709 
710       debug_printf("util_cpu_caps.x86_cpu_type = %u\n", util_cpu_caps.x86_cpu_type);
711       debug_printf("util_cpu_caps.cacheline = %u\n", util_cpu_caps.cacheline);
712 
713       debug_printf("util_cpu_caps.has_tsc = %u\n", util_cpu_caps.has_tsc);
714       debug_printf("util_cpu_caps.has_mmx = %u\n", util_cpu_caps.has_mmx);
715       debug_printf("util_cpu_caps.has_mmx2 = %u\n", util_cpu_caps.has_mmx2);
716       debug_printf("util_cpu_caps.has_sse = %u\n", util_cpu_caps.has_sse);
717       debug_printf("util_cpu_caps.has_sse2 = %u\n", util_cpu_caps.has_sse2);
718       debug_printf("util_cpu_caps.has_sse3 = %u\n", util_cpu_caps.has_sse3);
719       debug_printf("util_cpu_caps.has_ssse3 = %u\n", util_cpu_caps.has_ssse3);
720       debug_printf("util_cpu_caps.has_sse4_1 = %u\n", util_cpu_caps.has_sse4_1);
721       debug_printf("util_cpu_caps.has_sse4_2 = %u\n", util_cpu_caps.has_sse4_2);
722       debug_printf("util_cpu_caps.has_avx = %u\n", util_cpu_caps.has_avx);
723       debug_printf("util_cpu_caps.has_avx2 = %u\n", util_cpu_caps.has_avx2);
724       debug_printf("util_cpu_caps.has_f16c = %u\n", util_cpu_caps.has_f16c);
725       debug_printf("util_cpu_caps.has_popcnt = %u\n", util_cpu_caps.has_popcnt);
726       debug_printf("util_cpu_caps.has_3dnow = %u\n", util_cpu_caps.has_3dnow);
727       debug_printf("util_cpu_caps.has_3dnow_ext = %u\n", util_cpu_caps.has_3dnow_ext);
728       debug_printf("util_cpu_caps.has_xop = %u\n", util_cpu_caps.has_xop);
729       debug_printf("util_cpu_caps.has_altivec = %u\n", util_cpu_caps.has_altivec);
730       debug_printf("util_cpu_caps.has_vsx = %u\n", util_cpu_caps.has_vsx);
731       debug_printf("util_cpu_caps.has_neon = %u\n", util_cpu_caps.has_neon);
732       debug_printf("util_cpu_caps.has_daz = %u\n", util_cpu_caps.has_daz);
733       debug_printf("util_cpu_caps.has_avx512f = %u\n", util_cpu_caps.has_avx512f);
734       debug_printf("util_cpu_caps.has_avx512dq = %u\n", util_cpu_caps.has_avx512dq);
735       debug_printf("util_cpu_caps.has_avx512ifma = %u\n", util_cpu_caps.has_avx512ifma);
736       debug_printf("util_cpu_caps.has_avx512pf = %u\n", util_cpu_caps.has_avx512pf);
737       debug_printf("util_cpu_caps.has_avx512er = %u\n", util_cpu_caps.has_avx512er);
738       debug_printf("util_cpu_caps.has_avx512cd = %u\n", util_cpu_caps.has_avx512cd);
739       debug_printf("util_cpu_caps.has_avx512bw = %u\n", util_cpu_caps.has_avx512bw);
740       debug_printf("util_cpu_caps.has_avx512vl = %u\n", util_cpu_caps.has_avx512vl);
741       debug_printf("util_cpu_caps.has_avx512vbmi = %u\n", util_cpu_caps.has_avx512vbmi);
742    }
743 }
744 
745 static once_flag cpu_once_flag = ONCE_FLAG_INIT;
746 
747 void
util_cpu_detect(void)748 util_cpu_detect(void)
749 {
750    call_once(&cpu_once_flag, util_cpu_detect_once);
751 }
752