• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <assert.h>
13 #include <immintrin.h>
14 
15 #include "config/aom_config.h"
16 #include "config/aom_dsp_rtcd.h"
17 
18 #include "aom_ports/mem.h"
19 #include "aom/aom_integer.h"
20 
21 #include "aom_dsp/aom_dsp_common.h"
22 #include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
23 #include "aom_dsp/x86/synonyms.h"
24 
25 ////////////////////////////////////////////////////////////////////////////////
26 // 8 bit
27 ////////////////////////////////////////////////////////////////////////////////
28 
obmc_sad_w4(const uint8_t * pre,const int pre_stride,const int32_t * wsrc,const int32_t * mask,const int height)29 static AOM_FORCE_INLINE unsigned int obmc_sad_w4(const uint8_t *pre,
30                                                  const int pre_stride,
31                                                  const int32_t *wsrc,
32                                                  const int32_t *mask,
33                                                  const int height) {
34   const int pre_step = pre_stride - 4;
35   int n = 0;
36   __m128i v_sad_d = _mm_setzero_si128();
37 
38   do {
39     const __m128i v_p_b = xx_loadl_32(pre + n);
40     const __m128i v_m_d = xx_load_128(mask + n);
41     const __m128i v_w_d = xx_load_128(wsrc + n);
42 
43     const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b);
44 
45     // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
46     // boundaries. We use pmaddwd, as it has lower latency on Haswell
47     // than pmulld but produces the same result with these inputs.
48     const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
49 
50     const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
51     const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d);
52 
53     // Rounded absolute difference
54     const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12);
55 
56     v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d);
57 
58     n += 4;
59 
60     if (n % 4 == 0) pre += pre_step;
61   } while (n < 4 * height);
62 
63   return xx_hsum_epi32_si32(v_sad_d);
64 }
65 
obmc_sad_w8n(const uint8_t * pre,const int pre_stride,const int32_t * wsrc,const int32_t * mask,const int width,const int height)66 static AOM_FORCE_INLINE unsigned int obmc_sad_w8n(
67     const uint8_t *pre, const int pre_stride, const int32_t *wsrc,
68     const int32_t *mask, const int width, const int height) {
69   const int pre_step = pre_stride - width;
70   int n = 0;
71   __m128i v_sad_d = _mm_setzero_si128();
72 
73   assert(width >= 8);
74   assert(IS_POWER_OF_TWO(width));
75 
76   do {
77     const __m128i v_p1_b = xx_loadl_32(pre + n + 4);
78     const __m128i v_m1_d = xx_load_128(mask + n + 4);
79     const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
80     const __m128i v_p0_b = xx_loadl_32(pre + n);
81     const __m128i v_m0_d = xx_load_128(mask + n);
82     const __m128i v_w0_d = xx_load_128(wsrc + n);
83 
84     const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b);
85     const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b);
86 
87     // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
88     // boundaries. We use pmaddwd, as it has lower latency on Haswell
89     // than pmulld but produces the same result with these inputs.
90     const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
91     const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
92 
93     const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
94     const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
95     const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d);
96     const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d);
97 
98     // Rounded absolute difference
99     const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12);
100     const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12);
101 
102     v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d);
103     v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d);
104 
105     n += 8;
106 
107     if (n % width == 0) pre += pre_step;
108   } while (n < width * height);
109 
110   return xx_hsum_epi32_si32(v_sad_d);
111 }
112 
// Instantiates aom_obmc_sad<w>x<h>_sse4_1(): the 8-bit OBMC SAD for a
// w x h block. Dispatches to the dedicated 4-wide kernel when w == 4 and
// to the generic >=8-wide kernel otherwise; since w and h are literal
// constants at each expansion, the untaken branch is dead code the
// compiler eliminates.
#define OBMCSADWXH(w, h)                                       \
  unsigned int aom_obmc_sad##w##x##h##_sse4_1(                 \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
      const int32_t *msk) {                                    \
    if (w == 4) {                                              \
      return obmc_sad_w4(pre, pre_stride, wsrc, msk, h);       \
    } else {                                                   \
      return obmc_sad_w8n(pre, pre_stride, wsrc, msk, w, h);   \
    }                                                          \
  }

// One instantiation per supported block size; the symbol names must match
// the prototypes expected by the RTCD dispatch tables.
OBMCSADWXH(128, 128)
OBMCSADWXH(128, 64)
OBMCSADWXH(64, 128)
OBMCSADWXH(64, 64)
OBMCSADWXH(64, 32)
OBMCSADWXH(32, 64)
OBMCSADWXH(32, 32)
OBMCSADWXH(32, 16)
OBMCSADWXH(16, 32)
OBMCSADWXH(16, 16)
OBMCSADWXH(16, 8)
OBMCSADWXH(8, 16)
OBMCSADWXH(8, 8)
OBMCSADWXH(8, 4)
OBMCSADWXH(4, 8)
OBMCSADWXH(4, 4)
OBMCSADWXH(4, 16)
OBMCSADWXH(16, 4)
OBMCSADWXH(8, 32)
OBMCSADWXH(32, 8)
OBMCSADWXH(16, 64)
OBMCSADWXH(64, 16)
146 
147 ////////////////////////////////////////////////////////////////////////////////
148 // High bit-depth
149 ////////////////////////////////////////////////////////////////////////////////
150 
hbd_obmc_sad_w4(const uint8_t * pre8,const int pre_stride,const int32_t * wsrc,const int32_t * mask,const int height)151 static AOM_FORCE_INLINE unsigned int hbd_obmc_sad_w4(const uint8_t *pre8,
152                                                      const int pre_stride,
153                                                      const int32_t *wsrc,
154                                                      const int32_t *mask,
155                                                      const int height) {
156   const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
157   const int pre_step = pre_stride - 4;
158   int n = 0;
159   __m128i v_sad_d = _mm_setzero_si128();
160 
161   do {
162     const __m128i v_p_w = xx_loadl_64(pre + n);
163     const __m128i v_m_d = xx_load_128(mask + n);
164     const __m128i v_w_d = xx_load_128(wsrc + n);
165 
166     const __m128i v_p_d = _mm_cvtepu16_epi32(v_p_w);
167 
168     // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
169     // boundaries. We use pmaddwd, as it has lower latency on Haswell
170     // than pmulld but produces the same result with these inputs.
171     const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
172 
173     const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
174     const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d);
175 
176     // Rounded absolute difference
177     const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12);
178 
179     v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d);
180 
181     n += 4;
182 
183     if (n % 4 == 0) pre += pre_step;
184   } while (n < 4 * height);
185 
186   return xx_hsum_epi32_si32(v_sad_d);
187 }
188 
hbd_obmc_sad_w8n(const uint8_t * pre8,const int pre_stride,const int32_t * wsrc,const int32_t * mask,const int width,const int height)189 static AOM_FORCE_INLINE unsigned int hbd_obmc_sad_w8n(
190     const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
191     const int32_t *mask, const int width, const int height) {
192   const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
193   const int pre_step = pre_stride - width;
194   int n = 0;
195   __m128i v_sad_d = _mm_setzero_si128();
196 
197   assert(width >= 8);
198   assert(IS_POWER_OF_TWO(width));
199 
200   do {
201     const __m128i v_p1_w = xx_loadl_64(pre + n + 4);
202     const __m128i v_m1_d = xx_load_128(mask + n + 4);
203     const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
204     const __m128i v_p0_w = xx_loadl_64(pre + n);
205     const __m128i v_m0_d = xx_load_128(mask + n);
206     const __m128i v_w0_d = xx_load_128(wsrc + n);
207 
208     const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w);
209     const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w);
210 
211     // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
212     // boundaries. We use pmaddwd, as it has lower latency on Haswell
213     // than pmulld but produces the same result with these inputs.
214     const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
215     const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
216 
217     const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
218     const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
219     const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d);
220     const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d);
221 
222     // Rounded absolute difference
223     const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12);
224     const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12);
225 
226     v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d);
227     v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d);
228 
229     n += 8;
230 
231     if (n % width == 0) pre += pre_step;
232   } while (n < width * height);
233 
234   return xx_hsum_epi32_si32(v_sad_d);
235 }
236 
// Instantiates aom_highbd_obmc_sad<w>x<h>_sse4_1(): the high-bit-depth
// OBMC SAD for a w x h block. Dispatches to the dedicated 4-wide kernel
// when w == 4 and to the generic >=8-wide kernel otherwise; since w and h
// are literal constants at each expansion, the untaken branch is dead
// code the compiler eliminates.
#define HBD_OBMCSADWXH(w, h)                                      \
  unsigned int aom_highbd_obmc_sad##w##x##h##_sse4_1(             \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,    \
      const int32_t *mask) {                                      \
    if (w == 4) {                                                 \
      return hbd_obmc_sad_w4(pre, pre_stride, wsrc, mask, h);     \
    } else {                                                      \
      return hbd_obmc_sad_w8n(pre, pre_stride, wsrc, mask, w, h); \
    }                                                             \
  }

// One instantiation per supported block size; the symbol names must match
// the prototypes expected by the RTCD dispatch tables.
HBD_OBMCSADWXH(128, 128)
HBD_OBMCSADWXH(128, 64)
HBD_OBMCSADWXH(64, 128)
HBD_OBMCSADWXH(64, 64)
HBD_OBMCSADWXH(64, 32)
HBD_OBMCSADWXH(32, 64)
HBD_OBMCSADWXH(32, 32)
HBD_OBMCSADWXH(32, 16)
HBD_OBMCSADWXH(16, 32)
HBD_OBMCSADWXH(16, 16)
HBD_OBMCSADWXH(16, 8)
HBD_OBMCSADWXH(8, 16)
HBD_OBMCSADWXH(8, 8)
HBD_OBMCSADWXH(8, 4)
HBD_OBMCSADWXH(4, 8)
HBD_OBMCSADWXH(4, 4)
HBD_OBMCSADWXH(4, 16)
HBD_OBMCSADWXH(16, 4)
HBD_OBMCSADWXH(8, 32)
HBD_OBMCSADWXH(32, 8)
HBD_OBMCSADWXH(16, 64)
HBD_OBMCSADWXH(64, 16)
270