/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <immintrin.h>

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"

#include "aom_ports/mem.h"
#include "aom/aom_integer.h"

#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/obmc_intrinsic_sse4.h"
#include "aom_dsp/x86/synonyms.h"

////////////////////////////////////////////////////////////////////////////////
// 8 bit
////////////////////////////////////////////////////////////////////////////////

void aom_var_filter_block2d_bil_first_pass_ssse3(
    const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
    unsigned int pixel_step, unsigned int output_height,
    unsigned int output_width, const uint8_t *filter);

void aom_var_filter_block2d_bil_second_pass_ssse3(
    const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
    unsigned int pixel_step, unsigned int output_height,
    unsigned int output_width, const uint8_t *filter);

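// Accumulates the sum and sum of squares of the OBMC residual for blocks at
// least 8 pixels wide, processing 8 pixels (two groups of four) per loop
// iteration. Per pixel this is roughly the scalar computation
//   diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[i] - mask[i] * pre[i], 12);
//   *sum += diff;
//   *sse += diff * diff;
// with pmaddwd standing in for the 32-bit multiply (see the comment in the
// loop body).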
static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride,
                                     const int32_t *wsrc, const int32_t *mask,
                                     unsigned int *const sse, int *const sum,
                                     const int w, const int h) {
  const int pre_step = pre_stride - w;
  int n = 0;
  __m128i v_sum_d = _mm_setzero_si128();
  __m128i v_sse_d = _mm_setzero_si128();

  assert(w >= 8);
  assert(IS_POWER_OF_TWO(w));
  assert(IS_POWER_OF_TWO(h));

  do {
    const __m128i v_p1_b = xx_loadl_32(pre + n + 4);
    const __m128i v_m1_d = xx_load_128(mask + n + 4);
    const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
    const __m128i v_p0_b = xx_loadl_32(pre + n);
    const __m128i v_m0_d = xx_load_128(mask + n);
    const __m128i v_w0_d = xx_load_128(wsrc + n);

    const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b);
    const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b);

    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
    // boundaries. We use pmaddwd, as it has lower latency on Haswell
    // than pmulld but produces the same result with these inputs.
    const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
    const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);

    const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
    const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);

    const __m128i v_rdiff0_d = xx_roundn_epi32(v_diff0_d, 12);
    const __m128i v_rdiff1_d = xx_roundn_epi32(v_diff1_d, 12);
    const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff0_d, v_rdiff1_d);
    const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w);

    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff0_d);
    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d);
    v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);

    n += 8;

    if (n % w == 0) pre += pre_step;
  } while (n < w * h);

  *sum = xx_hsum_epi32_si32(v_sum_d);
  *sse = xx_hsum_epi32_si32(v_sse_d);
}

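// Generates aom_obmc_variance<W>x<H>_sse4_1(), which returns the OBMC
// variance: *sse minus the squared mean, i.e. *sse - sum^2 / (W * H).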
#define OBMCVARWXH(W, H)                                               \
  unsigned int aom_obmc_variance##W##x##H##_sse4_1(                    \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,         \
      const int32_t *mask, unsigned int *sse) {                        \
    int sum;                                                           \
    if (W == 4) {                                                      \
      obmc_variance_w4(pre, pre_stride, wsrc, mask, sse, &sum, H);     \
    } else {                                                           \
      obmc_variance_w8n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \
    }                                                                  \
    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));      \
  }

OBMCVARWXH(128, 128)
OBMCVARWXH(128, 64)
OBMCVARWXH(64, 128)
OBMCVARWXH(64, 64)
OBMCVARWXH(64, 32)
OBMCVARWXH(32, 64)
OBMCVARWXH(32, 32)
OBMCVARWXH(32, 16)
OBMCVARWXH(16, 32)
OBMCVARWXH(16, 16)
OBMCVARWXH(16, 8)
OBMCVARWXH(8, 16)
OBMCVARWXH(8, 8)
OBMCVARWXH(8, 4)
OBMCVARWXH(4, 8)
OBMCVARWXH(4, 4)
OBMCVARWXH(4, 16)
OBMCVARWXH(16, 4)
OBMCVARWXH(8, 32)
OBMCVARWXH(32, 8)
OBMCVARWXH(16, 64)
OBMCVARWXH(64, 16)

#include "config/aom_dsp_rtcd.h"

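// Generates aom_obmc_sub_pixel_variance<W>x<H>_sse4_1(): builds the sub-pixel
// predictor by running the two-tap bilinear filter horizontally (first pass,
// selected by xoffset) and then vertically (second pass, selected by yoffset),
// then computes the OBMC variance of the filtered block.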
#define OBMC_SUBPIX_VAR(W, H)                                                \
  uint32_t aom_obmc_sub_pixel_variance##W##x##H##_sse4_1(                    \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,          \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {         \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint8_t temp2[H * W];                                                    \
                                                                             \
    aom_var_filter_block2d_bil_first_pass_ssse3(                             \
        pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_var_filter_block2d_bil_second_pass_ssse3(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_obmc_variance##W##x##H##_sse4_1(temp2, W, wsrc, mask, sse);   \
  }

OBMC_SUBPIX_VAR(128, 128)
OBMC_SUBPIX_VAR(128, 64)
OBMC_SUBPIX_VAR(64, 128)
OBMC_SUBPIX_VAR(64, 64)
OBMC_SUBPIX_VAR(64, 32)
OBMC_SUBPIX_VAR(32, 64)
OBMC_SUBPIX_VAR(32, 32)
OBMC_SUBPIX_VAR(32, 16)
OBMC_SUBPIX_VAR(16, 32)
OBMC_SUBPIX_VAR(16, 16)
OBMC_SUBPIX_VAR(16, 8)
OBMC_SUBPIX_VAR(8, 16)
OBMC_SUBPIX_VAR(8, 8)
OBMC_SUBPIX_VAR(8, 4)
OBMC_SUBPIX_VAR(4, 8)
OBMC_SUBPIX_VAR(4, 4)
OBMC_SUBPIX_VAR(4, 16)
OBMC_SUBPIX_VAR(16, 4)
OBMC_SUBPIX_VAR(8, 32)
OBMC_SUBPIX_VAR(32, 8)
OBMC_SUBPIX_VAR(16, 64)
OBMC_SUBPIX_VAR(64, 16)

////////////////////////////////////////////////////////////////////////////////
// High bit-depth
////////////////////////////////////////////////////////////////////////////////
#if CONFIG_AV1_HIGHBITDEPTH
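// High bit-depth version of the 4-pixel-wide kernel: reads 16-bit pixels
// through the CONVERT_TO_SHORTPTR()'d pre8 pointer and writes the results to
// 64-bit sum/sse outputs.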
static INLINE void hbd_obmc_variance_w4(
    const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
    const int32_t *mask, uint64_t *const sse, int64_t *const sum,
    const int h) {
  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
  const int pre_step = pre_stride - 4;
  int n = 0;
  __m128i v_sum_d = _mm_setzero_si128();
  __m128i v_sse_d = _mm_setzero_si128();

  assert(IS_POWER_OF_TWO(h));

  do {
    const __m128i v_p_w = xx_loadl_64(pre + n);
    const __m128i v_m_d = xx_load_128(mask + n);
    const __m128i v_w_d = xx_load_128(wsrc + n);

    const __m128i v_p_d = _mm_cvtepu16_epi32(v_p_w);

    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
    // boundaries. We use pmaddwd, as it has lower latency on Haswell
    // than pmulld but produces the same result with these inputs.
    const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);

    const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
    const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12);
    const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d);

    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d);
    v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);

    n += 4;

    if (n % 4 == 0) pre += pre_step;
  } while (n < 4 * h);

  *sum = xx_hsum_epi32_si32(v_sum_d);
  *sse = xx_hsum_epi32_si32(v_sse_d);
}

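// High bit-depth version of obmc_variance_w8n(). The SIMD accumulators are
// still 32-bit; the per-call totals are added into the 64-bit *sum/*sse
// outputs, so callers split large blocks into several calls to avoid
// intermediate overflow (see highbd_10/12_obmc_variance below).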
static INLINE void hbd_obmc_variance_w8n(
    const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
    const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int w,
    const int h) {
  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
  const int pre_step = pre_stride - w;
  int n = 0;
  __m128i v_sum_d = _mm_setzero_si128();
  __m128i v_sse_d = _mm_setzero_si128();

  assert(w >= 8);
  assert(IS_POWER_OF_TWO(w));
  assert(IS_POWER_OF_TWO(h));

  do {
    const __m128i v_p1_w = xx_loadl_64(pre + n + 4);
    const __m128i v_m1_d = xx_load_128(mask + n + 4);
    const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
    const __m128i v_p0_w = xx_loadl_64(pre + n);
    const __m128i v_m0_d = xx_load_128(mask + n);
    const __m128i v_w0_d = xx_load_128(wsrc + n);

    const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w);
    const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w);

    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
    // boundaries. We use pmaddwd, as it has lower latency on Haswell
    // than pmulld but produces the same result with these inputs.
    const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
    const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);

    const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
    const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);

    const __m128i v_rdiff0_d = xx_roundn_epi32(v_diff0_d, 12);
    const __m128i v_rdiff1_d = xx_roundn_epi32(v_diff1_d, 12);
    const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff0_d, v_rdiff1_d);
    const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w);

    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff0_d);
    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d);
    v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);

    n += 8;

    if (n % w == 0) pre += pre_step;
  } while (n < w * h);

  *sum += xx_hsum_epi32_si64(v_sum_d);
  *sse += xx_hsum_epi32_si64(v_sse_d);
}

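// At 8-bit depth even a full 128x128 block fits the 32-bit SIMD accumulators,
// so the block is processed in a single call and the 64-bit totals are
// narrowed directly.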
static INLINE void highbd_8_obmc_variance(const uint8_t *pre8, int pre_stride,
                                          const int32_t *wsrc,
                                          const int32_t *mask, int w, int h,
                                          unsigned int *sse, int *sum) {
  int64_t sum64 = 0;
  uint64_t sse64 = 0;
  if (w == 4) {
    hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
  } else {
    hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
  }
  *sum = (int)sum64;
  *sse = (unsigned int)sse64;
}

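// At 10-bit depth a full 128x128 block could overflow the 32-bit SIMD
// accumulators, so it is split into 64-row chunks. The totals are then
// rounded down by 2 (sum) and 4 (sse) bits, matching the scaling used by the
// other highbd_10 variance kernels.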
static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride,
                                           const int32_t *wsrc,
                                           const int32_t *mask, int w, int h,
                                           unsigned int *sse, int *sum) {
  int64_t sum64 = 0;
  uint64_t sse64 = 0;
  if (w == 4) {
    hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
  } else if (w < 128 || h < 128) {
    hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
  } else {
    assert(w == 128 && h == 128);

    do {
      hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w,
                            64);
      pre8 += 64 * pre_stride;
      wsrc += 64 * w;
      mask += 64 * w;
      h -= 64;
    } while (h > 0);
  }
  *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
}

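// At 12-bit depth the 32-bit accumulators overflow even sooner, so at most
// 512 pixels are handled per call to hbd_obmc_variance_w8n(). The totals are
// rounded down by 4 (sum) and 8 (sse) bits.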
static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride,
                                           const int32_t *wsrc,
                                           const int32_t *mask, int w, int h,
                                           unsigned int *sse, int *sum) {
  int64_t sum64 = 0;
  uint64_t sse64 = 0;
  int max_pel_allowed_per_ovf = 512;
  if (w == 4) {
    hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
  } else if (w * h <= max_pel_allowed_per_ovf) {
    hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
  } else {
    int h_per_ovf = max_pel_allowed_per_ovf / w;

    assert(max_pel_allowed_per_ovf % w == 0);
    do {
      hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w,
                            h_per_ovf);
      pre8 += h_per_ovf * pre_stride;
      wsrc += h_per_ovf * w;
      mask += h_per_ovf * w;
      h -= h_per_ovf;
    } while (h > 0);
  }
  *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
}

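// Generates the 8-, 10- and 12-bit aom_highbd_*_obmc_variance<W>x<H>_sse4_1()
// functions. The 10- and 12-bit variants compute the variance in 64 bits and
// clamp a negative result (possible after the rounding above) to zero.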
#define HBD_OBMCVARWXH(W, H)                                               \
  unsigned int aom_highbd_8_obmc_variance##W##x##H##_sse4_1(               \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
      const int32_t *mask, unsigned int *sse) {                            \
    int sum;                                                               \
    highbd_8_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);  \
    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));          \
  }                                                                        \
                                                                           \
  unsigned int aom_highbd_10_obmc_variance##W##x##H##_sse4_1(              \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
      const int32_t *mask, unsigned int *sse) {                            \
    int sum;                                                               \
    int64_t var;                                                           \
    highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));              \
    return (var >= 0) ? (uint32_t)var : 0;                                 \
  }                                                                        \
                                                                           \
  unsigned int aom_highbd_12_obmc_variance##W##x##H##_sse4_1(              \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
      const int32_t *mask, unsigned int *sse) {                            \
    int sum;                                                               \
    int64_t var;                                                           \
    highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));              \
    return (var >= 0) ? (uint32_t)var : 0;                                 \
  }

HBD_OBMCVARWXH(128, 128)
HBD_OBMCVARWXH(128, 64)
HBD_OBMCVARWXH(64, 128)
HBD_OBMCVARWXH(64, 64)
HBD_OBMCVARWXH(64, 32)
HBD_OBMCVARWXH(32, 64)
HBD_OBMCVARWXH(32, 32)
HBD_OBMCVARWXH(32, 16)
HBD_OBMCVARWXH(16, 32)
HBD_OBMCVARWXH(16, 16)
HBD_OBMCVARWXH(16, 8)
HBD_OBMCVARWXH(8, 16)
HBD_OBMCVARWXH(8, 8)
HBD_OBMCVARWXH(8, 4)
HBD_OBMCVARWXH(4, 8)
HBD_OBMCVARWXH(4, 4)
HBD_OBMCVARWXH(4, 16)
HBD_OBMCVARWXH(16, 4)
HBD_OBMCVARWXH(8, 32)
HBD_OBMCVARWXH(32, 8)
HBD_OBMCVARWXH(16, 64)
HBD_OBMCVARWXH(64, 16)
#endif  // CONFIG_AV1_HIGHBITDEPTH