/* CpuArch.c -- CPU specific code
2023-05-18 : Igor Pavlov : Public domain */

#include "Precomp.h"

// #include <stdio.h>

#include "CpuArch.h"

#ifdef MY_CPU_X86_OR_AMD64

#undef NEED_CHECK_FOR_CPUID
#if !defined(MY_CPU_AMD64)
#define NEED_CHECK_FOR_CPUID
#endif

/*
  The cpuid instruction supports a (subFunction) parameter in ECX
  that is used only with some specific (function) values.
  But we always use only (subFunction == 0).
*/
/*
  __cpuid(): MSVC and GCC/CLANG use the same function/macro name,
             but the parameters are different.
  We use the MSVC __cpuid() parameter style for our z7_x86_cpuid() function.
*/

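/*
  Usage sketch (not part of this file's API; memcpy() from <string.h> assumed):
  p[] receives EAX, EBX, ECX, EDX in that order, and for (func == 0) the
  12-byte vendor string is laid out in EBX, EDX, ECX order:

    UInt32 p[4];
    char vendor[13];
    z7_x86_cpuid(p, 0);
    memcpy(vendor + 0, &p[1], 4);  // EBX
    memcpy(vendor + 4, &p[3], 4);  // EDX
    memcpy(vendor + 8, &p[2], 4);  // ECX
    vendor[12] = 0;  // "GenuineIntel", "AuthenticAMD", ...
*/
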
#if defined(__GNUC__) /* && (__GNUC__ >= 10) */ \
    || defined(__clang__) /* && (__clang_major__ >= 10) */

/* There were some CLANG/GCC compilers that had issues with
   rbx (ebx) handling in asm blocks in -fPIC mode (__PIC__ is defined).
   The compiler's <cpuid.h> contains a __cpuid() macro that is similar to our code.
   The history of __cpuid() changes in CLANG/GCC:
   GCC:
     2007: it preserved ebx for (__PIC__ && __i386__)
     2013: it preserved rbx and ebx for __PIC__
     2014: it doesn't preserve rbx and ebx anymore
     We suppose that (__GNUC__ >= 5) fixed that __PIC__ ebx/rbx problem.
   CLANG:
     2014+: it preserves rbx, but only for 64-bit code. No __PIC__ check.
   Why does CLANG care about 64-bit mode only, and not about ebx (in 32-bit)?
   Do we need a __PIC__ test for CLANG, or must we care about rbx even if
   __PIC__ is not defined?
*/

#define ASM_LN "\n"

#if defined(MY_CPU_AMD64) && defined(__PIC__) \
    && ((defined (__GNUC__) && (__GNUC__ < 5)) || defined(__clang__))

#define x86_cpuid_MACRO(p, func) { \
  __asm__ __volatile__ ( \
    ASM_LN   "mov     %%rbx, %q1"  \
    ASM_LN   "cpuid"               \
    ASM_LN   "xchg    %%rbx, %q1"  \
    : "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(0)); }

  /* "=&r" selects a free register. It can select even rbx, if that register is free.
     "=&D" (RDI) also works, but the code can be larger with "=&D".
     "2"(0) means (subFunction = 0);
     2 is the (zero-based) index of "=c" (ECX) in the output constraint list. */

#elif defined(MY_CPU_X86) && defined(__PIC__) \
    && ((defined (__GNUC__) && (__GNUC__ < 5)) || defined(__clang__))

#define x86_cpuid_MACRO(p, func) { \
  __asm__ __volatile__ ( \
    ASM_LN   "mov     %%ebx, %k1"  \
    ASM_LN   "cpuid"               \
    ASM_LN   "xchg    %%ebx, %k1"  \
    : "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(0)); }

#else

#define x86_cpuid_MACRO(p, func) { \
  __asm__ __volatile__ ( \
    ASM_LN   "cpuid"               \
    : "=a" ((p)[0]), "=b" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(0)); }

#endif


void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)
{
  x86_cpuid_MACRO(p, func)
}


Z7_NO_INLINE
UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void)
{
 #if defined(NEED_CHECK_FOR_CPUID)
  #define EFLAGS_CPUID_BIT 21
  UInt32 a;
  /* toggle the EFLAGS.ID bit (21): the cpuid instruction is supported
     only if that bit actually changes */
  __asm__ __volatile__ (
    ASM_LN   "pushf"
    ASM_LN   "pushf"
    ASM_LN   "pop     %0"
    // ASM_LN   "movl    %0, %1"
    // ASM_LN   "xorl    $0x200000, %0"
    ASM_LN   "btc     %1, %0"
    ASM_LN   "push    %0"
    ASM_LN   "popf"
    ASM_LN   "pushf"
    ASM_LN   "pop     %0"
    ASM_LN   "xorl    (%%esp), %0"

    ASM_LN   "popf"
    ASM_LN
    : "=&r" (a) // "=a"
    : "i" (EFLAGS_CPUID_BIT)
    );
  if ((a & (1 << EFLAGS_CPUID_BIT)) == 0)
    return 0;
 #endif
  {
    UInt32 p[4];
    x86_cpuid_MACRO(p, 0)
    return p[0];
  }
}

#undef ASM_LN

#elif !defined(_MSC_VER)

/*
// for gcc/clang and others: we can try to use the __cpuid macro:
#include <cpuid.h>
void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)
{
  __cpuid(func, p[0], p[1], p[2], p[3]);
}
UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void)
{
  return (UInt32)__get_cpuid_max(0, NULL);
}
*/
// for unsupported cpuid:
void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)
{
  UNUSED_VAR(func)
  p[0] = p[1] = p[2] = p[3] = 0;
}
UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void)
{
  return 0;
}

#else // _MSC_VER

#if !defined(MY_CPU_AMD64)

UInt32 __declspec(naked) Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void)
{
  #if defined(NEED_CHECK_FOR_CPUID)
  #define EFLAGS_CPUID_BIT 21
  __asm   pushfd
  __asm   pushfd
  /*
  __asm   pop     eax
  // __asm   mov     edx, eax
  __asm   btc     eax, EFLAGS_CPUID_BIT
  __asm   push    eax
  */
  __asm   btc     dword ptr [esp], EFLAGS_CPUID_BIT
  __asm   popfd
  __asm   pushfd
  __asm   pop     eax
  // __asm   xor     eax, edx
  __asm   xor     eax, [esp]
  // __asm   push    edx
  __asm   popfd
  __asm   and     eax, (1 shl EFLAGS_CPUID_BIT)
  __asm   jz end_func
  #endif
  __asm   push    ebx
  __asm   xor     eax, eax    // func
  __asm   xor     ecx, ecx    // subFunction (optional) for (func == 0)
  __asm   cpuid
  __asm   pop     ebx
  #if defined(NEED_CHECK_FOR_CPUID)
  end_func:
  #endif
  __asm   ret 0
}

void __declspec(naked) Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)
{
  UNUSED_VAR(p)
  UNUSED_VAR(func)
  __asm   push    ebx
  __asm   push    edi
  __asm   mov     edi, ecx    // p
  __asm   mov     eax, edx    // func
  __asm   xor     ecx, ecx    // subFunction (optional) for (func == 0)
  __asm   cpuid
  __asm   mov     [edi     ], eax
  __asm   mov     [edi +  4], ebx
  __asm   mov     [edi +  8], ecx
  __asm   mov     [edi + 12], edx
  __asm   pop     edi
  __asm   pop     ebx
  __asm   ret     0
}

#else // MY_CPU_AMD64

    #if _MSC_VER >= 1600
      #include <intrin.h>
      #define MY_cpuidex  __cpuidex
    #else
/*
 __cpuid (func == 0 or 7) requires the subfunction number in ECX.
  MSDN: The __cpuid intrinsic clears the ECX register before calling the cpuid instruction.
   __cpuid() in new MSVC clears ECX.
   __cpuid() in old MSVC (14.00) x64 doesn't clear ECX.
 We still can use __cpuid for low (func) values that don't require ECX,
 but __cpuid() in old MSVC will be incorrect for some func values (func == 7).
 So here we use a hack to make old MSVC pass (subFunction) in ECX to the cpuid instruction:
 ECX holds the first parameter of a FASTCALL / NO_INLINE function,
 so the caller of MY_cpuidex_HACK() sets ECX to subFunction, and
 old MSVC's __cpuid() doesn't change ECX, so the cpuid instruction receives that (subFunction) value.

DON'T remove Z7_NO_INLINE and Z7_FASTCALL for MY_cpuidex_HACK(): !!!
*/
static
Z7_NO_INLINE void Z7_FASTCALL MY_cpuidex_HACK(UInt32 subFunction, UInt32 func, int *CPUInfo)
{
  UNUSED_VAR(subFunction)
  __cpuid(CPUInfo, func);
}
      #define MY_cpuidex(info, func, func2)  MY_cpuidex_HACK(func2, func, info)
      #pragma message("======== MY_cpuidex_HACK WAS USED ========")
    #endif // _MSC_VER >= 1600

#if !defined(MY_CPU_AMD64)
/* inlining of __cpuid() in MSVC x86 (32-bit) produces big inefficient code,
   so we disable inlining here */
Z7_NO_INLINE
#endif
void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)
{
  MY_cpuidex((int *)p, (int)func, 0);
}

Z7_NO_INLINE
UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void)
{
  int a[4];
  MY_cpuidex(a, 0, 0);
  return a[0];
}

#endif // MY_CPU_AMD64
#endif // _MSC_VER

#if defined(NEED_CHECK_FOR_CPUID)
#define CHECK_CPUID_IS_SUPPORTED { if (z7_x86_cpuid_GetMaxFunc() == 0) return 0; }
#else
#define CHECK_CPUID_IS_SUPPORTED
#endif
#undef NEED_CHECK_FOR_CPUID


static
BoolInt x86cpuid_Func_1(UInt32 *p)
{
  CHECK_CPUID_IS_SUPPORTED
  z7_x86_cpuid(p, 1);
  return True;
}

/*
static const UInt32 kVendors[][1] =
{
  { 0x756E6547 }, // , 0x49656E69, 0x6C65746E },
  { 0x68747541 }, // , 0x69746E65, 0x444D4163 },
  { 0x746E6543 }  // , 0x48727561, 0x736C7561 }
};
*/

/*
typedef struct
{
  UInt32 maxFunc;
  UInt32 vendor[3];
  UInt32 ver;
  UInt32 b;
  UInt32 c;
  UInt32 d;
} Cx86cpuid;

enum
{
  CPU_FIRM_INTEL,
  CPU_FIRM_AMD,
  CPU_FIRM_VIA
};
int x86cpuid_GetFirm(const Cx86cpuid *p);
#define x86cpuid_ver_GetFamily(ver) (((ver >> 16) & 0xff0) | ((ver >> 8) & 0xf))
#define x86cpuid_ver_GetModel(ver)  (((ver >> 12) &  0xf0) | ((ver >> 4) & 0xf))
#define x86cpuid_ver_GetStepping(ver) (ver & 0xf)
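
Example: ver == 0x000306A9 (a CPUID.1:EAX value from an Ivy Bridge CPU)
gives family = 6, model = 0x3A, stepping = 9 with the macros above.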

int x86cpuid_GetFirm(const Cx86cpuid *p)
{
  unsigned i;
  for (i = 0; i < sizeof(kVendors) / sizeof(kVendors[0]); i++)
  {
    const UInt32 *v = kVendors[i];
    if (v[0] == p->vendor[0]
        // && v[1] == p->vendor[1]
        // && v[2] == p->vendor[2]
        )
      return (int)i;
  }
  return -1;
}

BoolInt CPU_Is_InOrder()
{
  Cx86cpuid p;
  UInt32 family, model;
  if (!x86cpuid_CheckAndRead(&p))
    return True;

  family = x86cpuid_ver_GetFamily(p.ver);
  model = x86cpuid_ver_GetModel(p.ver);

  switch (x86cpuid_GetFirm(&p))
  {
    case CPU_FIRM_INTEL: return (family < 6 || (family == 6 && (
        // In-Order Atom CPU
           model == 0x1C  // 45 nm, N4xx, D4xx, N5xx, D5xx, 230, 330
        || model == 0x26  // 45 nm, Z6xx
        || model == 0x27  // 32 nm, Z2460
        || model == 0x35  // 32 nm, Z2760
        || model == 0x36  // 32 nm, N2xxx, D2xxx
        )));
    case CPU_FIRM_AMD: return (family < 5 || (family == 5 && (model < 6 || model == 0xA)));
    case CPU_FIRM_VIA: return (family < 6 || (family == 6 && model < 0xF));
  }
  return False; // v23 : unknown processors are not In-Order
}
*/

#ifdef _WIN32
#include "7zWindows.h"
#endif

#if !defined(MY_CPU_AMD64) && defined(_WIN32)

/* For legacy SSE on ia32 there is no user-space cpu instruction to check
   that the OS supports SSE register storing/restoring on context switches.
   So we need some OS-specific function to check that it's safe to use SSE registers.
*/

Z7_FORCE_INLINE
static BoolInt CPU_Sys_Is_SSE_Supported(void)
{
#ifdef _MSC_VER
  #pragma warning(push)
  #pragma warning(disable : 4996) // `GetVersion': was declared deprecated
#endif
  /* The low byte of GetVersion() is the major version of Windows.
     We suppose that any Windows version since
     Windows 2000 (major == 5) supports SSE registers. */
  return (Byte)GetVersion() >= 5;
#if defined(_MSC_VER)
  #pragma warning(pop)
#endif
}
#define CHECK_SYS_SSE_SUPPORT if (!CPU_Sys_Is_SSE_Supported()) return False;
#else
#define CHECK_SYS_SSE_SUPPORT
#endif


#if !defined(MY_CPU_AMD64)

BoolInt CPU_IsSupported_CMOV(void)
{
  UInt32 a[4];
  if (!x86cpuid_Func_1(&a[0]))
    return 0;
  return (a[3] >> 15) & 1;
}

BoolInt CPU_IsSupported_SSE(void)
{
  UInt32 a[4];
  CHECK_SYS_SSE_SUPPORT
  if (!x86cpuid_Func_1(&a[0]))
    return 0;
  return (a[3] >> 25) & 1;
}

BoolInt CPU_IsSupported_SSE2(void)
{
  UInt32 a[4];
  CHECK_SYS_SSE_SUPPORT
  if (!x86cpuid_Func_1(&a[0]))
    return 0;
  return (a[3] >> 26) & 1;
}

#endif


static UInt32 x86cpuid_Func_1_ECX(void)
{
  UInt32 a[4];
  CHECK_SYS_SSE_SUPPORT
  if (!x86cpuid_Func_1(&a[0]))
    return 0;
  return a[2];
}

BoolInt CPU_IsSupported_AES(void)
{
  return (x86cpuid_Func_1_ECX() >> 25) & 1;
}

BoolInt CPU_IsSupported_SSSE3(void)
{
  return (x86cpuid_Func_1_ECX() >> 9) & 1;
}

BoolInt CPU_IsSupported_SSE41(void)
{
  return (x86cpuid_Func_1_ECX() >> 19) & 1;
}

BoolInt CPU_IsSupported_SHA(void)
{
  CHECK_SYS_SSE_SUPPORT

  if (z7_x86_cpuid_GetMaxFunc() < 7)
    return False;
  {
    UInt32 d[4];
    z7_x86_cpuid(d, 7);
    return (d[1] >> 29) & 1;
  }
}

/*
MSVC: the _xgetbv() intrinsic is available since VS2010SP1.
   MSVC also defines the (_XCR_XFEATURE_ENABLED_MASK) macro in
   <immintrin.h> that we can use or check.
   For any 32-bit x86 we can use asm code in MSVC,
   but MSVC asm code is huge after compilation,
   so _xgetbv() is better.

ICC: the _xgetbv() intrinsic is available (since which version of ICC?).
   ICC defines (__GNUC__) and supports the gnu assembler.
   ICC also supports MASM-style code with the -use-msasm switch,
   but ICC doesn't support __attribute__((__target__)).

GCC/CLANG 9:
  _xgetbv() is a macro that works via __builtin_ia32_xgetbv(),
  and we need __attribute__((__target__("xsave"))).
  But with __target__("xsave") the function will not be
  inlined into a function that has no __target__("xsave") attribute.
  If we want _xgetbv() call inlining, then we should use the asm version
  instead of calling _xgetbv().
  Note: the intrinsic is broken before GCC 8.2:
    https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85684
*/

#if    defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1100) \
    || defined(_MSC_VER) && (_MSC_VER >= 1600) && (_MSC_FULL_VER >= 160040219)  \
    || defined(__GNUC__) && (__GNUC__ >= 9) \
    || defined(__clang__) && (__clang_major__ >= 9)
// we define ATTRIB_XGETBV, if we want to use the predefined _xgetbv() from the compiler
#if defined(__INTEL_COMPILER)
#define ATTRIB_XGETBV
#elif defined(__GNUC__) || defined(__clang__)
// we don't define ATTRIB_XGETBV here, because the asm version is better for inlining.
// #define ATTRIB_XGETBV __attribute__((__target__("xsave")))
#else
#define ATTRIB_XGETBV
#endif
#endif

#if defined(ATTRIB_XGETBV)
#include <immintrin.h>
#endif


// XFEATURE_ENABLED_MASK/XCR0
#define MY_XCR_XFEATURE_ENABLED_MASK 0

#if defined(ATTRIB_XGETBV)
ATTRIB_XGETBV
#endif
static UInt64 x86_xgetbv_0(UInt32 num)
{
#if defined(ATTRIB_XGETBV)
  {
    return
      #if (defined(_MSC_VER))
        _xgetbv(num);
      #else
        __builtin_ia32_xgetbv(
          #if !defined(__clang__)
            (int)
          #endif
            num);
      #endif
  }

#elif defined(__GNUC__) || defined(__clang__) || defined(__SUNPRO_CC)

  UInt32 a, d;
 #if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4))
  __asm__
  (
    "xgetbv"
    : "=a"(a), "=d"(d) : "c"(num) : "cc"
  );
 #else // is old gcc
  __asm__
  (
    ".byte 0x0f, 0x01, 0xd0" "\n\t"
    : "=a"(a), "=d"(d) : "c"(num) : "cc"
  );
 #endif
  return ((UInt64)d << 32) | a;
  // return a;

#elif defined(_MSC_VER) && !defined(MY_CPU_AMD64)

  UInt32 a, d;
  __asm {
    push eax
    push edx
    push ecx
    mov ecx, num;
    // xor ecx, ecx // = MY_XCR_XFEATURE_ENABLED_MASK
    _emit 0x0f
    _emit 0x01
    _emit 0xd0
    mov a, eax
    mov d, edx
    pop ecx
    pop edx
    pop eax
  }
  return ((UInt64)d << 32) | a;
  // return a;

#else // unknown compiler
  // #error "Need xgetbv function"
  UNUSED_VAR(num)
  // for MSVC-X64 we could call an external function from an external file.
  /* Actually we have already checked OSXSAVE/AVX with cpuid before.
     So it's expected that the OS supports at least AVX and below. */
  // if (num != MY_XCR_XFEATURE_ENABLED_MASK) return 0; // if not XCR0
  return
      // (1 << 0) |  // x87
        (1 << 1)   // SSE
      | (1 << 2);  // AVX

#endif
}

#ifdef _WIN32
/*
  Windows versions do not know about new ISA extensions that
  can be introduced later. We still can use a new extension
  even if Windows doesn't report support for it, but only if
  Windows knows about the ISA extension that changes the number
  or size of registers: SSE, AVX/XSAVE, AVX512.
  So it's enough to check
    MY_PF_AVX_INSTRUCTIONS_AVAILABLE
      instead of
    MY_PF_AVX2_INSTRUCTIONS_AVAILABLE
*/
#define MY_PF_XSAVE_ENABLED                            17
// #define MY_PF_SSSE3_INSTRUCTIONS_AVAILABLE             36
// #define MY_PF_SSE4_1_INSTRUCTIONS_AVAILABLE            37
// #define MY_PF_SSE4_2_INSTRUCTIONS_AVAILABLE            38
// #define MY_PF_AVX_INSTRUCTIONS_AVAILABLE               39
// #define MY_PF_AVX2_INSTRUCTIONS_AVAILABLE              40
// #define MY_PF_AVX512F_INSTRUCTIONS_AVAILABLE           41
#endif

BoolInt CPU_IsSupported_AVX(void)
{
  #ifdef _WIN32
  if (!IsProcessorFeaturePresent(MY_PF_XSAVE_ENABLED))
    return False;
  /* PF_AVX_INSTRUCTIONS_AVAILABLE probably is supported starting from
     some late Win10 revisions. But we need AVX in older Windows also,
     so we don't use the following check: */
  /*
  if (!IsProcessorFeaturePresent(MY_PF_AVX_INSTRUCTIONS_AVAILABLE))
    return False;
  */
  #endif

  /*
    The OS must use the special XSAVE/XRSTOR instructions to save
    AVX registers when it's required for context switching.
    At OS startup:
      The OS sets the CR4.OSXSAVE flag to signal the processor that the OS supports the XSAVE extensions.
      Also the OS sets a bitmask in the XCR0 register that defines which
      registers will be processed by the XSAVE instruction:
        XCR0.X87[bit 0] - x87 registers and state
        XCR0.SSE[bit 1] - SSE registers and state
        XCR0.AVX[bit 2] - AVX registers and state
    CR4.OSXSAVE is reflected in CPUID.1:ECX.OSXSAVE[bit 27],
       so we can read that bit in user-space.
    XCR0 is available for reading in user-space via the XGETBV instruction.
  */
  {
    const UInt32 c = x86cpuid_Func_1_ECX();
    if (0 == (1
        & (c >> 28)   // AVX instructions are supported by hardware
        & (c >> 27))) // OSXSAVE bit: XSAVE and related instructions are enabled by the OS.
      return False;
  }

  /* We also could check
     CPUID.1:ECX.XSAVE [bit 26], which shows that the
        XSAVE, XRSTOR, XSETBV, XGETBV instructions are supported by hardware.
     But that check is redundant, because if the OSXSAVE bit is set, then XSAVE is also set. */

  /* If the OS has enabled the XSAVE extension instructions (OSXSAVE == 1),
     in most cases we expect that the OS also supports storing/restoring
     of at least the AVX and SSE states.
     But to be sure, we call the user-space instruction
     XGETBV(0) to get the XCR0 value, which contains a bitmask defining
     which exact states (registers) the OS has enabled for storing/restoring.
  */

  {
    const UInt32 bm = (UInt32)x86_xgetbv_0(MY_XCR_XFEATURE_ENABLED_MASK);
    // printf("\n=== XGetBV=%d\n", bm);
    return 1
        & (bm >> 1)  // SSE state is supported (set by OS) for storing/restoring
        & (bm >> 2); // AVX state is supported (set by OS) for storing/restoring
  }
  // since Win7 SP1 we can use GetEnabledXStateFeatures(); see the sketch below
}
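
/*
Alternative sketch for the Win7 SP1+ path mentioned above (assumption: the
SDK's <windows.h> declares GetEnabledXStateFeatures() and XSTATE_MASK_AVX;
this file does not use it, to keep support for older Windows):

  static BoolInt CPU_OS_Supports_AVX_State(void)
  {
    return (GetEnabledXStateFeatures() & XSTATE_MASK_AVX) ? True : False;
  }
*/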


BoolInt CPU_IsSupported_AVX2(void)
{
  if (!CPU_IsSupported_AVX())
    return False;
  if (z7_x86_cpuid_GetMaxFunc() < 7)
    return False;
  {
    UInt32 d[4];
    z7_x86_cpuid(d, 7);
    // printf("\ncpuid(7): ebx=%8x ecx=%8x\n", d[1], d[2]);
    return 1
      & (d[1] >> 5); // avx2
  }
}

BoolInt CPU_IsSupported_VAES_AVX2(void)
{
  if (!CPU_IsSupported_AVX())
    return False;
  if (z7_x86_cpuid_GetMaxFunc() < 7)
    return False;
  {
    UInt32 d[4];
    z7_x86_cpuid(d, 7);
    // printf("\ncpuid(7): ebx=%8x ecx=%8x\n", d[1], d[2]);
    return 1
      & (d[1] >> 5) // avx2
      // & (d[1] >> 31) // avx512vl
      & (d[2] >> 9); // vaes // VEX-256/EVEX
  }
}

BoolInt CPU_IsSupported_PageGB(void)
{
  CHECK_CPUID_IS_SUPPORTED
  {
    UInt32 d[4];
    z7_x86_cpuid(d, 0x80000000);
    if (d[0] < 0x80000001)
      return False;
    z7_x86_cpuid(d, 0x80000001);
    return (d[3] >> 26) & 1;
  }
}
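
/*
Caller-side usage sketch (the static cache variable and both dispatch
targets are hypothetical, not part of this file): the checks above execute
cpuid on every call, so a caller typically probes once and caches the result:

  static int g_avx2 = -1;  // -1: unknown, 0: not supported, 1: supported
  if (g_avx2 < 0)
    g_avx2 = (int)CPU_IsSupported_AVX2();
  if (g_avx2)
    Hash_Update_AVX2(data, size);     // hypothetical AVX2 code path
  else
    Hash_Update_Generic(data, size);  // hypothetical portable code path
*/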


#elif defined(MY_CPU_ARM_OR_ARM64)

#ifdef _WIN32

#include "7zWindows.h"

BoolInt CPU_IsSupported_CRC32(void)  { return IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE) ? 1 : 0; }
BoolInt CPU_IsSupported_CRYPTO(void) { return IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE) ? 1 : 0; }
BoolInt CPU_IsSupported_NEON(void)   { return IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE) ? 1 : 0; }

#else

#if defined(__APPLE__)

/*
#include <stdio.h>
#include <string.h>
static void Print_sysctlbyname(const char *name)
{
  size_t bufSize = 256;
  char buf[256];
  int res = sysctlbyname(name, &buf, &bufSize, NULL, 0);
  {
    int i;
    printf("\nres = %d : %s : '%s' : bufSize = %d, numeric", res, name, buf, (unsigned)bufSize);
    for (i = 0; i < 20; i++)
      printf(" %2x", (unsigned)(Byte)buf[i]);

  }
}
*/
/*
  Print_sysctlbyname("hw.pagesize");
  Print_sysctlbyname("machdep.cpu.brand_string");
*/

static BoolInt z7_sysctlbyname_Get_BoolInt(const char *name)
{
  UInt32 val = 0;
  if (z7_sysctlbyname_Get_UInt32(name, &val) == 0 && val == 1)
    return 1;
  return 0;
}

BoolInt CPU_IsSupported_CRC32(void)
{
  return z7_sysctlbyname_Get_BoolInt("hw.optional.armv8_crc32");
}

BoolInt CPU_IsSupported_NEON(void)
{
  return z7_sysctlbyname_Get_BoolInt("hw.optional.neon");
}

#ifdef MY_CPU_ARM64
#define APPLE_CRYPTO_SUPPORT_VAL 1
#else
#define APPLE_CRYPTO_SUPPORT_VAL 0
#endif

BoolInt CPU_IsSupported_SHA1(void) { return APPLE_CRYPTO_SUPPORT_VAL; }
BoolInt CPU_IsSupported_SHA2(void) { return APPLE_CRYPTO_SUPPORT_VAL; }
BoolInt CPU_IsSupported_AES (void) { return APPLE_CRYPTO_SUPPORT_VAL; }


#else // __APPLE__

#include <sys/auxv.h>

#define USE_HWCAP

#ifdef USE_HWCAP

#include <asm/hwcap.h>

  #define MY_HWCAP_CHECK_FUNC_2(name1, name2) \
  BoolInt CPU_IsSupported_ ## name1() { return (getauxval(AT_HWCAP)  & (HWCAP_  ## name2)) ? 1 : 0; }

#ifdef MY_CPU_ARM64
  #define MY_HWCAP_CHECK_FUNC(name) \
  MY_HWCAP_CHECK_FUNC_2(name, name)
  MY_HWCAP_CHECK_FUNC_2(NEON, ASIMD)
// MY_HWCAP_CHECK_FUNC (ASIMD)
#elif defined(MY_CPU_ARM)
  #define MY_HWCAP_CHECK_FUNC(name) \
  BoolInt CPU_IsSupported_ ## name() { return (getauxval(AT_HWCAP2) & (HWCAP2_ ## name)) ? 1 : 0; }
  MY_HWCAP_CHECK_FUNC_2(NEON, NEON)
#endif

#else // USE_HWCAP

  #define MY_HWCAP_CHECK_FUNC(name) \
  BoolInt CPU_IsSupported_ ## name() { return 0; }
  MY_HWCAP_CHECK_FUNC(NEON)

#endif // USE_HWCAP

MY_HWCAP_CHECK_FUNC (CRC32)
MY_HWCAP_CHECK_FUNC (SHA1)
MY_HWCAP_CHECK_FUNC (SHA2)
MY_HWCAP_CHECK_FUNC (AES)
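
/*
For reference, a sketch of what the macros above expand to: on MY_CPU_ARM64,
MY_HWCAP_CHECK_FUNC(CRC32) becomes

  BoolInt CPU_IsSupported_CRC32() { return (getauxval(AT_HWCAP) & (HWCAP_CRC32)) ? 1 : 0; }

while on 32-bit MY_CPU_ARM the same check reads AT_HWCAP2 / HWCAP2_CRC32 instead.
*/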

#endif // __APPLE__
#endif // _WIN32

#endif // MY_CPU_ARM_OR_ARM64



#ifdef __APPLE__

#include <sys/sysctl.h>

int z7_sysctlbyname_Get(const char *name, void *buf, size_t *bufSize)
{
  return sysctlbyname(name, buf, bufSize, NULL, 0);
}

int z7_sysctlbyname_Get_UInt32(const char *name, UInt32 *val)
{
  size_t bufSize = sizeof(*val);
  const int res = z7_sysctlbyname_Get(name, val, &bufSize);
  if (res == 0 && bufSize != sizeof(*val))
    return EFAULT;
  return res;
}

#endif
824