• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2019 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5 
6 #pragma once
7 
8 
9 #ifdef __AVX512F__
10 #include <immintrin.h>
11 
12 // GCC pre-7, Clang pre-8, Apple Clang pre-10, and ICC pre-18
13 #if (defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) && (__GNUC__ < 7)) || \
14     (defined(__clang__) && !defined(__apple_build_version__) && (__clang_major__ < 8)) || \
15     (defined(__clang__) && defined(__apple_build_version__) && (__apple_build_version__ < 10000000)) || \
16     (defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 1800))
17 
// Polyfill for _cvtu32_mask16: turns an unsigned integer into an AVX512
// __mmask16 value using an ordinary C cast.
// Only compiled for the older compiler versions selected by the enclosing
// #if (presumably because their <immintrin.h> lacks this intrinsic).
static inline __mmask16 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_cvtu32_mask16(unsigned int mask) {
  const __mmask16 converted = (__mmask16) mask;
  return converted;
}
22 
23 #endif  // GCC pre-7, Clang pre-8, Apple Clang pre-10, and ICC pre-18
24 
25 // GCC pre-7, Clang pre-4, and ICC pre-18
26 #if (defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) && (__GNUC__ < 7)) || \
27     (defined(__clang__) && (__clang_major__ < 4)) || \
28     (defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 1800))
29 
// Polyfill for _mm512_reduce_add_ps: horizontally adds the float lanes of v
// and returns the total as a scalar. The vector is folded in half four times
// (512 -> 256 -> 128 -> 64 -> 32 bits), adding the two halves at every step.
static inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_reduce_add_ps(__m512 v) {
  // Step 1: split v into its lower and upper 256-bit halves.
  const __m256 lo256 = _mm512_castps512_ps256(v);
#if __AVX512DQ__
  const __m256 hi256 = _mm512_extractf32x8_ps(v, 1);
#else
  // No AVX512DQ: extract the upper half through the double-precision
  // variant and reinterpret the bits back as floats.
  const __m256 hi256 = _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(v), 1));
#endif
  const __m256 acc256 = _mm256_add_ps(lo256, hi256);

  // Step 2: add the upper 128 bits onto the lower 128 bits.
  const __m128 lo128 = _mm256_castps256_ps128(acc256);
  const __m128 hi128 = _mm256_extractf128_ps(acc256, 1);
  const __m128 acc128 = _mm_add_ps(lo128, hi128);

  // Step 3: add the high pair of lanes onto the low pair.
  const __m128 hi_pair = _mm_movehl_ps(acc128, acc128);
  const __m128 acc64 = _mm_add_ps(acc128, hi_pair);

  // Step 4: add lane 1 onto lane 0; the result lives in the lowest lane.
  const __m128 odd_lanes = _mm_movehdup_ps(acc64);
  const __m128 acc32 = _mm_add_ss(acc64, odd_lanes);

  return _mm_cvtss_f32(acc32);
}
42 
// Polyfill for _mm512_reduce_max_ps: returns the largest float lane of v as a
// scalar. The vector is folded in half four times
// (512 -> 256 -> 128 -> 64 -> 32 bits), taking the lane-wise maximum of the
// two halves at every step.
static inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_reduce_max_ps(__m512 v) {
  // Step 1: split v into its lower and upper 256-bit halves.
  const __m256 lo256 = _mm512_castps512_ps256(v);
#if __AVX512DQ__
  const __m256 hi256 = _mm512_extractf32x8_ps(v, 1);
#else
  // No AVX512DQ: extract the upper half through the double-precision
  // variant and reinterpret the bits back as floats.
  const __m256 hi256 = _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(v), 1));
#endif
  const __m256 max256 = _mm256_max_ps(lo256, hi256);

  // Step 2: maximum of the lower and upper 128 bits.
  const __m128 lo128 = _mm256_castps256_ps128(max256);
  const __m128 hi128 = _mm256_extractf128_ps(max256, 1);
  const __m128 max128 = _mm_max_ps(lo128, hi128);

  // Step 3: maximum of the low pair of lanes and the high pair.
  const __m128 hi_pair = _mm_movehl_ps(max128, max128);
  const __m128 max64 = _mm_max_ps(max128, hi_pair);

  // Step 4: maximum of lane 0 and lane 1; the result lives in the lowest lane.
  const __m128 odd_lanes = _mm_movehdup_ps(max64);
  const __m128 max32 = _mm_max_ss(max64, odd_lanes);

  return _mm_cvtss_f32(max32);
}
55 
56 #endif  // GCC pre-7, Clang pre-4, and ICC pre-18
57 
58 #endif  // __AVX512F__
59