/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <immintrin.h>

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"

#include "aom_ports/mem.h"
#include "aom/aom_integer.h"

#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
#include "aom_dsp/x86/synonyms.h"

////////////////////////////////////////////////////////////////////////////////
// 8 bit
////////////////////////////////////////////////////////////////////////////////

obmc_sad_w4_avx2(const uint8_t * pre,const int pre_stride,const int32_t * wsrc,const int32_t * mask,const int height)29 static INLINE unsigned int obmc_sad_w4_avx2(const uint8_t *pre,
30                                             const int pre_stride,
31                                             const int32_t *wsrc,
32                                             const int32_t *mask,
33                                             const int height) {
34   int n = 0;
35   __m256i v_sad_d = _mm256_setzero_si256();
36   const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
37 
38   do {
39     const __m128i v_p_b_0 = xx_loadl_32(pre);
40     const __m128i v_p_b_1 = xx_loadl_32(pre + pre_stride);
41     const __m128i v_p_b = _mm_unpacklo_epi32(v_p_b_0, v_p_b_1);
42     const __m256i v_m_d = _mm256_lddqu_si256((__m256i *)(mask + n));
43     const __m256i v_w_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
44 
45     const __m256i v_p_d = _mm256_cvtepu8_epi32(v_p_b);
46 
47     // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
48     // boundaries. We use pmaddwd, as it has lower latency on Haswell
49     // than pmulld but produces the same result with these inputs.
50     const __m256i v_pm_d = _mm256_madd_epi16(v_p_d, v_m_d);
51 
52     const __m256i v_diff_d = _mm256_sub_epi32(v_w_d, v_pm_d);
53     const __m256i v_absdiff_d = _mm256_abs_epi32(v_diff_d);
54 
55     // Rounded absolute difference
56     const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff_d, v_bias_d);
57     const __m256i v_rad_d = _mm256_srli_epi32(v_tmp_d, 12);
58 
59     v_sad_d = _mm256_add_epi32(v_sad_d, v_rad_d);
60 
61     n += 8;
62     pre += pre_stride << 1;
63   } while (n < 8 * (height >> 1));
64 
65   __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
66   __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
67   v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
68   return xx_hsum_epi32_si32(v_sad_d_0);
69 }
70 
obmc_sad_w8n_avx2(const uint8_t * pre,const int pre_stride,const int32_t * wsrc,const int32_t * mask,const int width,const int height)71 static INLINE unsigned int obmc_sad_w8n_avx2(
72     const uint8_t *pre, const int pre_stride, const int32_t *wsrc,
73     const int32_t *mask, const int width, const int height) {
74   const int pre_step = pre_stride - width;
75   int n = 0;
76   __m256i v_sad_d = _mm256_setzero_si256();
77   const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
78   assert(width >= 8);
79   assert(IS_POWER_OF_TWO(width));
80 
81   do {
82     const __m128i v_p0_b = xx_loadl_64(pre + n);
83     const __m256i v_m0_d = _mm256_lddqu_si256((__m256i *)(mask + n));
84     const __m256i v_w0_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
85 
86     const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p0_b);
87 
88     // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
89     // boundaries. We use pmaddwd, as it has lower latency on Haswell
90     // than pmulld but produces the same result with these inputs.
91     const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d);
92 
93     const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d);
94     const __m256i v_absdiff0_d = _mm256_abs_epi32(v_diff0_d);
95 
96     // Rounded absolute difference
97     const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff0_d, v_bias_d);
98     const __m256i v_rad0_d = _mm256_srli_epi32(v_tmp_d, 12);
99 
100     v_sad_d = _mm256_add_epi32(v_sad_d, v_rad0_d);
101 
102     n += 8;
103 
104     if ((n & (width - 1)) == 0) pre += pre_step;
105   } while (n < width * height);
106 
107   __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
108   __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
109   v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
110   return xx_hsum_epi32_si32(v_sad_d_0);
111 }
112 
113 #define OBMCSADWXH(w, h)                                          \
114   unsigned int aom_obmc_sad##w##x##h##_avx2(                      \
115       const uint8_t *pre, int pre_stride, const int32_t *wsrc,    \
116       const int32_t *msk) {                                       \
117     if (w == 4) {                                                 \
118       return obmc_sad_w4_avx2(pre, pre_stride, wsrc, msk, h);     \
119     } else {                                                      \
120       return obmc_sad_w8n_avx2(pre, pre_stride, wsrc, msk, w, h); \
121     }                                                             \
122   }
123 
124 OBMCSADWXH(128, 128)
125 OBMCSADWXH(128, 64)
126 OBMCSADWXH(64, 128)
127 OBMCSADWXH(64, 64)
128 OBMCSADWXH(64, 32)
129 OBMCSADWXH(32, 64)
130 OBMCSADWXH(32, 32)
131 OBMCSADWXH(32, 16)
132 OBMCSADWXH(16, 32)
133 OBMCSADWXH(16, 16)
134 OBMCSADWXH(16, 8)
135 OBMCSADWXH(8, 16)
136 OBMCSADWXH(8, 8)
137 OBMCSADWXH(8, 4)
138 OBMCSADWXH(4, 8)
139 OBMCSADWXH(4, 4)
140 OBMCSADWXH(4, 16)
141 OBMCSADWXH(16, 4)
142 OBMCSADWXH(8, 32)
143 OBMCSADWXH(32, 8)
144 OBMCSADWXH(16, 64)
145 OBMCSADWXH(64, 16)
146 
////////////////////////////////////////////////////////////////////////////////
// High bit-depth
////////////////////////////////////////////////////////////////////////////////

hbd_obmc_sad_w4_avx2(const uint8_t * pre8,const int pre_stride,const int32_t * wsrc,const int32_t * mask,const int height)151 static INLINE unsigned int hbd_obmc_sad_w4_avx2(const uint8_t *pre8,
152                                                 const int pre_stride,
153                                                 const int32_t *wsrc,
154                                                 const int32_t *mask,
155                                                 const int height) {
156   const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
157   int n = 0;
158   __m256i v_sad_d = _mm256_setzero_si256();
159   const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
160   do {
161     const __m128i v_p_w_0 = xx_loadl_64(pre);
162     const __m128i v_p_w_1 = xx_loadl_64(pre + pre_stride);
163     const __m128i v_p_w = _mm_unpacklo_epi64(v_p_w_0, v_p_w_1);
164     const __m256i v_m_d = _mm256_lddqu_si256((__m256i *)(mask + n));
165     const __m256i v_w_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
166 
167     const __m256i v_p_d = _mm256_cvtepu16_epi32(v_p_w);
168 
169     // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
170     // boundaries. We use pmaddwd, as it has lower latency on Haswell
171     // than pmulld but produces the same result with these inputs.
172     const __m256i v_pm_d = _mm256_madd_epi16(v_p_d, v_m_d);
173 
174     const __m256i v_diff_d = _mm256_sub_epi32(v_w_d, v_pm_d);
175     const __m256i v_absdiff_d = _mm256_abs_epi32(v_diff_d);
176 
177     // Rounded absolute difference
178 
179     const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff_d, v_bias_d);
180     const __m256i v_rad_d = _mm256_srli_epi32(v_tmp_d, 12);
181 
182     v_sad_d = _mm256_add_epi32(v_sad_d, v_rad_d);
183 
184     n += 8;
185 
186     pre += pre_stride << 1;
187   } while (n < 8 * (height >> 1));
188 
189   __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
190   __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
191   v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
192   return xx_hsum_epi32_si32(v_sad_d_0);
193 }
194 
hbd_obmc_sad_w8n_avx2(const uint8_t * pre8,const int pre_stride,const int32_t * wsrc,const int32_t * mask,const int width,const int height)195 static INLINE unsigned int hbd_obmc_sad_w8n_avx2(
196     const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
197     const int32_t *mask, const int width, const int height) {
198   const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
199   const int pre_step = pre_stride - width;
200   int n = 0;
201   __m256i v_sad_d = _mm256_setzero_si256();
202   const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
203 
204   assert(width >= 8);
205   assert(IS_POWER_OF_TWO(width));
206 
207   do {
208     const __m128i v_p0_w = _mm_lddqu_si128((__m128i *)(pre + n));
209     const __m256i v_m0_d = _mm256_lddqu_si256((__m256i *)(mask + n));
210     const __m256i v_w0_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
211 
212     const __m256i v_p0_d = _mm256_cvtepu16_epi32(v_p0_w);
213 
214     // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
215     // boundaries. We use pmaddwd, as it has lower latency on Haswell
216     // than pmulld but produces the same result with these inputs.
217     const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d);
218 
219     const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d);
220     const __m256i v_absdiff0_d = _mm256_abs_epi32(v_diff0_d);
221 
222     // Rounded absolute difference
223     const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff0_d, v_bias_d);
224     const __m256i v_rad0_d = _mm256_srli_epi32(v_tmp_d, 12);
225 
226     v_sad_d = _mm256_add_epi32(v_sad_d, v_rad0_d);
227 
228     n += 8;
229 
230     if (n % width == 0) pre += pre_step;
231   } while (n < width * height);
232 
233   __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
234   __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
235   v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
236   return xx_hsum_epi32_si32(v_sad_d_0);
237 }
238 
239 #define HBD_OBMCSADWXH(w, h)                                           \
240   unsigned int aom_highbd_obmc_sad##w##x##h##_avx2(                    \
241       const uint8_t *pre, int pre_stride, const int32_t *wsrc,         \
242       const int32_t *mask) {                                           \
243     if (w == 4) {                                                      \
244       return hbd_obmc_sad_w4_avx2(pre, pre_stride, wsrc, mask, h);     \
245     } else {                                                           \
246       return hbd_obmc_sad_w8n_avx2(pre, pre_stride, wsrc, mask, w, h); \
247     }                                                                  \
248   }
249 
250 HBD_OBMCSADWXH(128, 128)
251 HBD_OBMCSADWXH(128, 64)
252 HBD_OBMCSADWXH(64, 128)
253 HBD_OBMCSADWXH(64, 64)
254 HBD_OBMCSADWXH(64, 32)
255 HBD_OBMCSADWXH(32, 64)
256 HBD_OBMCSADWXH(32, 32)
257 HBD_OBMCSADWXH(32, 16)
258 HBD_OBMCSADWXH(16, 32)
259 HBD_OBMCSADWXH(16, 16)
260 HBD_OBMCSADWXH(16, 8)
261 HBD_OBMCSADWXH(8, 16)
262 HBD_OBMCSADWXH(8, 8)
263 HBD_OBMCSADWXH(8, 4)
264 HBD_OBMCSADWXH(4, 8)
265 HBD_OBMCSADWXH(4, 4)
266 HBD_OBMCSADWXH(4, 16)
267 HBD_OBMCSADWXH(16, 4)
268 HBD_OBMCSADWXH(8, 32)
269 HBD_OBMCSADWXH(32, 8)
270 HBD_OBMCSADWXH(16, 64)
271 HBD_OBMCSADWXH(64, 16)
272