/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VPX_VPX_DSP_X86_CONVOLVE_AVX2_H_
#define VPX_VPX_DSP_X86_CONVOLVE_AVX2_H_

#include <immintrin.h>  // AVX2

#include "./vpx_config.h"

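// Some older clang and gcc releases exposed the 128-to-256-bit broadcast
// under a different intrinsic name and signature, so dispatch on the
// compiler version.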
#if defined(__clang__)
#if (__clang_major__ > 0 && __clang_major__ < 3) ||            \
    (__clang_major__ == 3 && __clang_minor__ <= 3) ||          \
    (defined(__APPLE__) && defined(__apple_build_version__) && \
     ((__clang_major__ == 4 && __clang_minor__ <= 2) ||        \
      (__clang_major__ == 5 && __clang_minor__ == 0)))
#define MM256_BROADCASTSI128_SI256(x) \
  _mm_broadcastsi128_si256((__m128i const *)&(x))
#else  // clang > 3.3, and not 5.0 on macosx.
#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
#endif  // clang <= 3.3
#elif defined(__GNUC__)
#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6)
#define MM256_BROADCASTSI128_SI256(x) \
  _mm_broadcastsi128_si256((__m128i const *)&(x))
#elif __GNUC__ == 4 && __GNUC_MINOR__ == 7
#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x)
#else  // gcc > 4.7
#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
#endif  // gcc <= 4.6
#else   // !(gcc || clang)
#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
#endif  // __clang__

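// Load the eight 16-bit filter taps and pack each adjacent pair into the two
// bytes of every 16-bit lane of f[i]: f[i] holds taps 2*i and 2*i+1,
// duplicated across the register, ready for _mm256_maddubs_epi16().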
static INLINE void shuffle_filter_avx2(const int16_t *const filter,
                                       __m256i *const f) {
  const __m256i f_values =
      MM256_BROADCASTSI128_SI256(_mm_load_si128((const __m128i *)filter));
  // pack and duplicate the filter values
  f[0] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0200u));
  f[1] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0604u));
  f[2] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0a08u));
  f[3] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0e0cu));
}

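// Filter 16 pixels with an 8-tap filter. Each 16-bit lane of s[i] must hold
// the pair of source bytes that multiplies the tap pair in f[i] (as produced
// by shuffle_filter_avx2()). Returns (sum + 64) >> 7 in 16 16-bit lanes.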
static INLINE __m256i convolve8_16_avx2(const __m256i *const s,
                                        const __m256i *const f) {
  // multiply each pair of adjacent source bytes by the matching filter taps
  // and add the products within each 16-bit lane
  const __m256i k_64 = _mm256_set1_epi16(1 << 6);
  const __m256i x0 = _mm256_maddubs_epi16(s[0], f[0]);
  const __m256i x1 = _mm256_maddubs_epi16(s[1], f[1]);
  const __m256i x2 = _mm256_maddubs_epi16(s[2], f[2]);
  const __m256i x3 = _mm256_maddubs_epi16(s[3], f[3]);
  __m256i sum1, sum2;

  // Sum the partial results, saturating only on the final step.
  // Adding x0 to x2 and x1 to x3 is the only order that avoids overflow for
  // all supported filters.
  sum1 = _mm256_add_epi16(x0, x2);
  sum2 = _mm256_add_epi16(x1, x3);
  // add the rounding offset early to avoid another saturated add
  sum1 = _mm256_add_epi16(sum1, k_64);
  sum1 = _mm256_adds_epi16(sum1, sum2);
  // complete the rounding by shifting each 16-bit sum right by 7 bits
  sum1 = _mm256_srai_epi16(sum1, 7);
  return sum1;
}

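// Same as convolve8_16_avx2(), but operates on the low 128 bits of s[] and
// f[] only, producing eight 16-bit results.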
static INLINE __m128i convolve8_8_avx2(const __m256i *const s,
                                       const __m256i *const f) {
  // multiply each pair of adjacent source bytes by the matching filter taps
  // and add the products within each 16-bit lane
  const __m128i k_64 = _mm_set1_epi16(1 << 6);
  const __m128i x0 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[0]),
                                       _mm256_castsi256_si128(f[0]));
  const __m128i x1 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[1]),
                                       _mm256_castsi256_si128(f[1]));
  const __m128i x2 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[2]),
                                       _mm256_castsi256_si128(f[2]));
  const __m128i x3 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[3]),
                                       _mm256_castsi256_si128(f[3]));
  __m128i sum1, sum2;

  // Sum the partial results, saturating only on the final step.
  // Adding x0 to x2 and x1 to x3 is the only order that avoids overflow for
  // all supported filters.
  sum1 = _mm_add_epi16(x0, x2);
  sum2 = _mm_add_epi16(x1, x3);
  // add the rounding offset early to avoid another saturated add
  sum1 = _mm_add_epi16(sum1, k_64);
  sum1 = _mm_adds_epi16(sum1, sum2);
  // complete the rounding by shifting each 16-bit sum right by 7 bits
  sum1 = _mm_srai_epi16(sum1, 7);
  return sum1;
}

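// Load two unaligned 128-bit blocks into the low and high halves of a
// 256-bit register.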
static INLINE __m256i mm256_loadu2_si128(const void *lo, const void *hi) {
  const __m256i tmp =
      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)lo));
  return _mm256_inserti128_si256(tmp, _mm_loadu_si128((const __m128i *)hi), 1);
}

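// Load two unaligned 64-bit blocks into the bottom of the low and high
// 128-bit halves of a 256-bit register.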
static INLINE __m256i mm256_loadu2_epi64(const void *lo, const void *hi) {
  const __m256i tmp =
      _mm256_castsi128_si256(_mm_loadl_epi64((const __m128i *)lo));
  return _mm256_inserti128_si256(tmp, _mm_loadl_epi64((const __m128i *)hi), 1);
}

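// Store the low and high 128-bit halves of *src to two aligned destinations.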
static INLINE void mm256_store2_si128(__m128i *const dst_ptr_1,
                                      __m128i *const dst_ptr_2,
                                      const __m256i *const src) {
  _mm_store_si128(dst_ptr_1, _mm256_castsi256_si128(*src));
  _mm_store_si128(dst_ptr_2, _mm256_extractf128_si256(*src, 1));
}

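// Store the low 64 bits of each 128-bit half of *src to two destinations.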
static INLINE void mm256_storeu2_epi64(__m128i *const dst_ptr_1,
                                       __m128i *const dst_ptr_2,
                                       const __m256i *const src) {
  _mm_storel_epi64(dst_ptr_1, _mm256_castsi256_si128(*src));
  _mm_storel_epi64(dst_ptr_2, _mm256_extractf128_si256(*src, 1));
}

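// Store the low 32 bits of each 128-bit half of *src to two destinations.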
static INLINE void mm256_storeu2_epi32(__m128i *const dst_ptr_1,
                                       __m128i *const dst_ptr_2,
                                       const __m256i *const src) {
  *((int *)(dst_ptr_1)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*src));
  *((int *)(dst_ptr_2)) = _mm_cvtsi128_si32(_mm256_extractf128_si256(*src, 1));
}

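// Divide each 32-bit lane by 1 << depth, rounding to nearest: add
// *half_depth (1 << (depth - 1)), then shift right arithmetically by depth.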
static INLINE __m256i mm256_round_epi32(const __m256i *const src,
                                        const __m256i *const half_depth,
                                        const int depth) {
  const __m256i nearest_src = _mm256_add_epi32(*src, *half_depth);
  return _mm256_srai_epi32(nearest_src, depth);
}

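// As mm256_round_epi32(), but on 16-bit lanes and with a saturating add.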
static INLINE __m256i mm256_round_epi16(const __m256i *const src,
                                        const __m256i *const half_depth,
                                        const int depth) {
  const __m256i nearest_src = _mm256_adds_epi16(*src, *half_depth);
  return _mm256_srai_epi16(nearest_src, depth);
}

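// Multiply-accumulate two source/kernel register pairs with
// _mm256_madd_epi16() and sum the resulting 32-bit products.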
static INLINE __m256i mm256_madd_add_epi32(const __m256i *const src_0,
                                           const __m256i *const src_1,
                                           const __m256i *const ker_0,
                                           const __m256i *const ker_1) {
  const __m256i tmp_0 = _mm256_madd_epi16(*src_0, *ker_0);
  const __m256i tmp_1 = _mm256_madd_epi16(*src_1, *ker_1);
  return _mm256_add_epi32(tmp_0, tmp_1);
}

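// Illustrative sketch, not part of the original header: one way the helpers
// above can compose into a single 16-pixel-wide, 8-tap vertical convolution
// step. The function name and signature are hypothetical, and uint8_t /
// ptrdiff_t are assumed to be available from the usual vpx integer headers.
static INLINE void convolve8_vert_16_sketch(const uint8_t *const src,
                                            const ptrdiff_t src_stride,
                                            uint8_t *const dst,
                                            const int16_t *const filter) {
  __m256i f[4], s[4], sum, packed;
  __m128i rows[8];
  int i;

  // Duplicate the filter taps: f[i] holds taps 2*i and 2*i+1.
  shuffle_filter_avx2(filter, f);

  // Load the eight source rows contributing to this output row.
  for (i = 0; i < 8; ++i) {
    rows[i] = _mm_loadu_si128((const __m128i *)(src + i * src_stride));
  }

  // Interleave adjacent row pairs so each 16-bit lane holds the two source
  // bytes matching the duplicated tap pair in f[i]: pixels 0-7 go to the
  // low 128-bit half, pixels 8-15 to the high half.
  for (i = 0; i < 4; ++i) {
    const __m128i lo = _mm_unpacklo_epi8(rows[2 * i], rows[2 * i + 1]);
    const __m128i hi = _mm_unpackhi_epi8(rows[2 * i], rows[2 * i + 1]);
    s[i] = _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
  }

  // Filter and round, saturate the 16-bit sums to 8 bits, then store the
  // two 8-pixel halves.
  sum = convolve8_16_avx2(s, f);
  packed = _mm256_packus_epi16(sum, sum);
  mm256_storeu2_epi64((__m128i *)dst, (__m128i *)(dst + 8), &packed);
}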
#undef MM256_BROADCASTSI128_SI256

#endif  // VPX_VPX_DSP_X86_CONVOLVE_AVX2_H_