/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VPX_VPX_DSP_X86_CONVOLVE_AVX2_H_
#define VPX_VPX_DSP_X86_CONVOLVE_AVX2_H_

#include <immintrin.h>  // AVX2

#include "./vpx_config.h"

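// The 128-to-256-bit broadcast intrinsic changed spelling across compiler
// generations: very old clang and gcc (<= 4.6) only provide
// _mm_broadcastsi128_si256() taking a pointer, gcc 4.7 provides it by value
// under the _mm_ prefix, and newer compilers use the standard
// _mm256_broadcastsi128_si256() name. Hide the differences behind a macro.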
#if defined(__clang__)
#if (__clang_major__ > 0 && __clang_major__ < 3) ||            \
    (__clang_major__ == 3 && __clang_minor__ <= 3) ||          \
    (defined(__APPLE__) && defined(__apple_build_version__) && \
     ((__clang_major__ == 4 && __clang_minor__ <= 2) ||        \
      (__clang_major__ == 5 && __clang_minor__ == 0)))
#define MM256_BROADCASTSI128_SI256(x) \
  _mm_broadcastsi128_si256((__m128i const *)&(x))
#else  // clang > 3.3, and not 5.0 on macosx.
#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
#endif  // clang <= 3.3
#elif defined(__GNUC__)
#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6)
#define MM256_BROADCASTSI128_SI256(x) \
  _mm_broadcastsi128_si256((__m128i const *)&(x))
#elif __GNUC__ == 4 && __GNUC_MINOR__ == 7
#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x)
#else  // gcc > 4.7
#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
#endif  // gcc <= 4.6
#else   // !(gcc || clang)
#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
#endif  // __clang__

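// Broadcast the 8 16-bit filter taps into both 128-bit lanes, then pack each
// adjacent pair of taps (narrowed to their low bytes) into every 16-bit lane
// of f[0]..f[3], ready for use with _mm256_maddubs_epi16().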
static INLINE void shuffle_filter_avx2(const int16_t *const filter,
                                       __m256i *const f) {
  const __m256i f_values =
      MM256_BROADCASTSI128_SI256(_mm_load_si128((const __m128i *)filter));
  // pack and duplicate the filter values
  f[0] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0200u));
  f[1] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0604u));
  f[2] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0a08u));
  f[3] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0e0cu));
}

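// Convolve 16 pixels at once. s[0..3] hold interleaved source byte pairs that
// line up with the tap pairs produced by shuffle_filter_avx2(); the four
// partial sums are combined, the 1 << 6 rounding constant is added, and the
// total is shifted right by 7 to drop the filter's 7 fractional bits.
//
// A minimal usage sketch (gathering the source bytes is up to the caller):
//   __m256i f[4], s[4];
//   shuffle_filter_avx2(filter, f);
//   /* load interleaved source bytes into s[0..3] */
//   const __m256i rounded = convolve8_16_avx2(s, f);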
static INLINE __m256i convolve8_16_avx2(const __m256i *const s,
                                        const __m256i *const f) {
  // multiply 2 adjacent elements with the filter and add the result
  const __m256i k_64 = _mm256_set1_epi16(1 << 6);
  const __m256i x0 = _mm256_maddubs_epi16(s[0], f[0]);
  const __m256i x1 = _mm256_maddubs_epi16(s[1], f[1]);
  const __m256i x2 = _mm256_maddubs_epi16(s[2], f[2]);
  const __m256i x3 = _mm256_maddubs_epi16(s[3], f[3]);
  __m256i sum1, sum2;

  // sum the results together, saturating only on the final step
  // adding x0 with x2 and x1 with x3 is the only order that avoids
  // overflow for all filters
  sum1 = _mm256_add_epi16(x0, x2);
  sum2 = _mm256_add_epi16(x1, x3);
  // add the rounding offset early to avoid another saturated add
  sum1 = _mm256_add_epi16(sum1, k_64);
  sum1 = _mm256_adds_epi16(sum1, sum2);
  // round and shift each 16-bit result right by 7 bits
  sum1 = _mm256_srai_epi16(sum1, 7);
  return sum1;
}

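// Same as convolve8_16_avx2(), but only the low 128-bit lanes of s[] and f[]
// are used, producing 8 filtered results.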
static INLINE __m128i convolve8_8_avx2(const __m256i *const s,
                                       const __m256i *const f) {
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i k_64 = _mm_set1_epi16(1 << 6);
  const __m128i x0 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[0]),
                                       _mm256_castsi256_si128(f[0]));
  const __m128i x1 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[1]),
                                       _mm256_castsi256_si128(f[1]));
  const __m128i x2 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[2]),
                                       _mm256_castsi256_si128(f[2]));
  const __m128i x3 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[3]),
                                       _mm256_castsi256_si128(f[3]));
  __m128i sum1, sum2;

  // sum the results together, saturating only on the final step
  // adding x0 with x2 and x1 with x3 is the only order that avoids
  // overflow for all filters
  sum1 = _mm_add_epi16(x0, x2);
  sum2 = _mm_add_epi16(x1, x3);
  // add the rounding offset early to avoid another saturated add
  sum1 = _mm_add_epi16(sum1, k_64);
  sum1 = _mm_adds_epi16(sum1, sum2);
  // shift each 16-bit result right by 7 bits
  sum1 = _mm_srai_epi16(sum1, 7);
  return sum1;
}

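// Load two unaligned 128-bit blocks: lo fills the low half of the result, hi
// the high half.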
static INLINE __m256i mm256_loadu2_si128(const void *lo, const void *hi) {
  const __m256i tmp =
      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)lo));
  return _mm256_inserti128_si256(tmp, _mm_loadu_si128((const __m128i *)hi), 1);
}

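// Load 64 bits from lo and from hi into the low 8 bytes of the respective
// 128-bit halves; the upper 8 bytes of each half are zeroed.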
static INLINE __m256i mm256_loadu2_epi64(const void *lo, const void *hi) {
  const __m256i tmp =
      _mm256_castsi128_si256(_mm_loadl_epi64((const __m128i *)lo));
  return _mm256_inserti128_si256(tmp, _mm_loadl_epi64((const __m128i *)hi), 1);
}

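// Store the two 128-bit halves of *src to two 16-byte-aligned destinations.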
static INLINE void mm256_store2_si128(__m128i *const dst_ptr_1,
                                      __m128i *const dst_ptr_2,
                                      const __m256i *const src) {
  _mm_store_si128(dst_ptr_1, _mm256_castsi256_si128(*src));
  _mm_store_si128(dst_ptr_2, _mm256_extractf128_si256(*src, 1));
}

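// Store the low 64 bits of each 128-bit half of *src; the destinations need
// not be aligned.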
static INLINE void mm256_storeu2_epi64(__m128i *const dst_ptr_1,
                                       __m128i *const dst_ptr_2,
                                       const __m256i *const src) {
  _mm_storel_epi64(dst_ptr_1, _mm256_castsi256_si128(*src));
  _mm_storel_epi64(dst_ptr_2, _mm256_extractf128_si256(*src, 1));
}

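// Store the low 32 bits of each 128-bit half of *src through plain int
// pointers.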
static INLINE void mm256_storeu2_epi32(__m128i *const dst_ptr_1,
                                       __m128i *const dst_ptr_2,
                                       const __m256i *const src) {
  *((int *)(dst_ptr_1)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*src));
  *((int *)(dst_ptr_2)) = _mm_cvtsi128_si32(_mm256_extractf128_si256(*src, 1));
}

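// Round each signed 32-bit lane: returns (*src + *half_depth) >> depth, where
// *half_depth is expected to hold 1 << (depth - 1) for round-to-nearest.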
static INLINE __m256i mm256_round_epi32(const __m256i *const src,
                                        const __m256i *const half_depth,
                                        const int depth) {
  const __m256i nearest_src = _mm256_add_epi32(*src, *half_depth);
  return _mm256_srai_epi32(nearest_src, depth);
}

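// 16-bit variant of mm256_round_epi32(); the rounding bias is added with
// saturation.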
static INLINE __m256i mm256_round_epi16(const __m256i *const src,
                                        const __m256i *const half_depth,
                                        const int depth) {
  const __m256i nearest_src = _mm256_adds_epi16(*src, *half_depth);
  return _mm256_srai_epi16(nearest_src, depth);
}

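// Multiply-accumulate two 16-bit vector pairs and sum the products:
// madd(*src_0, *ker_0) + madd(*src_1, *ker_1), as signed 32-bit lanes.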
static INLINE __m256i mm256_madd_add_epi32(const __m256i *const src_0,
                                           const __m256i *const src_1,
                                           const __m256i *const ker_0,
                                           const __m256i *const ker_1) {
  const __m256i tmp_0 = _mm256_madd_epi16(*src_0, *ker_0);
  const __m256i tmp_1 = _mm256_madd_epi16(*src_1, *ker_1);
  return _mm256_add_epi32(tmp_0, tmp_1);
}

#undef MM256_BROADCASTSI128_SI256

#endif  // VPX_VPX_DSP_X86_CONVOLVE_AVX2_H_