/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <immintrin.h>

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"

#include "aom_ports/mem.h"
#include "aom/aom_integer.h"

#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/obmc_intrinsic_sse4.h"
#include "aom_dsp/x86/synonyms.h"

////////////////////////////////////////////////////////////////////////////////
// 8 bit
////////////////////////////////////////////////////////////////////////////////

void aom_var_filter_block2d_bil_first_pass_ssse3(
    const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
    unsigned int pixel_step, unsigned int output_height,
    unsigned int output_width, const uint8_t *filter);

void aom_var_filter_block2d_bil_second_pass_ssse3(
    const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
    unsigned int pixel_step, unsigned int output_height,
    unsigned int output_width, const uint8_t *filter);

static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride,
                                     const int32_t *wsrc, const int32_t *mask,
                                     unsigned int *const sse, int *const sum,
                                     const int w, const int h) {
  const int pre_step = pre_stride - w;
  int n = 0;
  __m128i v_sum_d = _mm_setzero_si128();
  __m128i v_sse_d = _mm_setzero_si128();

  assert(w >= 8);
  assert(IS_POWER_OF_TWO(w));
  assert(IS_POWER_OF_TWO(h));

  do {
    const __m128i v_p1_b = xx_loadl_32(pre + n + 4);
    const __m128i v_m1_d = xx_load_128(mask + n + 4);
    const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
    const __m128i v_p0_b = xx_loadl_32(pre + n);
    const __m128i v_m0_d = xx_load_128(mask + n);
    const __m128i v_w0_d = xx_load_128(wsrc + n);

    const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b);
    const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b);

    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
    // boundaries. We use pmaddwd, as it has lower latency on Haswell
    // than pmulld but produces the same result with these inputs.
    const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
    const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);

    const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
    const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);

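    // Scale the weighted difference back down by its 12 fractional bits
    // (the OBMC masks are, to the best of this note's assumption, built
    // from 6-bit AOM_BLEND_A64 weights applied twice), matching the C
    // reference's ROUND_POWER_OF_TWO_SIGNED(diff, 12).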
    const __m128i v_rdiff0_d = xx_roundn_epi32(v_diff0_d, 12);
    const __m128i v_rdiff1_d = xx_roundn_epi32(v_diff1_d, 12);
    const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff0_d, v_rdiff1_d);
    const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w);

    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff0_d);
    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d);
    v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);

    n += 8;

    if (n % w == 0) pre += pre_step;
  } while (n < w * h);

  *sum = xx_hsum_epi32_si32(v_sum_d);
  *sse = xx_hsum_epi32_si32(v_sse_d);
}
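
// For reference, a scalar sketch of what the loop above computes
// (illustrative only; it mirrors libaom's C reference implementation and
// assumes ROUND_POWER_OF_TWO_SIGNED from aom_dsp_common.h):
//
//   for (int i = 0; i < h; i++) {
//     for (int j = 0; j < w; j++) {
//       const int diff =
//           ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12);
//       *sum += diff;
//       *sse += diff * diff;
//     }
//     pre += pre_stride;
//     wsrc += w;
//     mask += w;
//   }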

#define OBMCVARWXH(W, H)                                               \
  unsigned int aom_obmc_variance##W##x##H##_sse4_1(                    \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,         \
      const int32_t *mask, unsigned int *sse) {                        \
    int sum;                                                           \
    if (W == 4) {                                                      \
      obmc_variance_w4(pre, pre_stride, wsrc, mask, sse, &sum, H);     \
    } else {                                                           \
      obmc_variance_w8n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \
    }                                                                  \
    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));      \
  }
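
// Standard variance identity, scaled by the pixel count N = W * H: the
// functions return sse - sum^2 / N, i.e. N times the block variance.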

OBMCVARWXH(128, 128)
OBMCVARWXH(128, 64)
OBMCVARWXH(64, 128)
OBMCVARWXH(64, 64)
OBMCVARWXH(64, 32)
OBMCVARWXH(32, 64)
OBMCVARWXH(32, 32)
OBMCVARWXH(32, 16)
OBMCVARWXH(16, 32)
OBMCVARWXH(16, 16)
OBMCVARWXH(16, 8)
OBMCVARWXH(8, 16)
OBMCVARWXH(8, 8)
OBMCVARWXH(8, 4)
OBMCVARWXH(4, 8)
OBMCVARWXH(4, 4)
OBMCVARWXH(4, 16)
OBMCVARWXH(16, 4)
OBMCVARWXH(8, 32)
OBMCVARWXH(32, 8)
OBMCVARWXH(16, 64)
OBMCVARWXH(64, 16)
#define OBMC_SUBPIX_VAR(W, H)                                                \
  uint32_t aom_obmc_sub_pixel_variance##W##x##H##_sse4_1(                    \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,          \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {         \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint8_t temp2[H * W];                                                    \
                                                                             \
    aom_var_filter_block2d_bil_first_pass_ssse3(                             \
        pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_var_filter_block2d_bil_second_pass_ssse3(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_obmc_variance##W##x##H##_sse4_1(temp2, W, wsrc, mask, sse);   \
  }
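
// Sub-pixel flow: the first pass bilinearly filters (H + 1) rows in the x
// direction into 16-bit intermediates (fdata3); the second pass filters in
// the y direction down to the 8-bit W x H block (temp2), whose OBMC
// variance is then measured against wsrc/mask.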

OBMC_SUBPIX_VAR(128, 128)
OBMC_SUBPIX_VAR(128, 64)
OBMC_SUBPIX_VAR(64, 128)
OBMC_SUBPIX_VAR(64, 64)
OBMC_SUBPIX_VAR(64, 32)
OBMC_SUBPIX_VAR(32, 64)
OBMC_SUBPIX_VAR(32, 32)
OBMC_SUBPIX_VAR(32, 16)
OBMC_SUBPIX_VAR(16, 32)
OBMC_SUBPIX_VAR(16, 16)
OBMC_SUBPIX_VAR(16, 8)
OBMC_SUBPIX_VAR(8, 16)
OBMC_SUBPIX_VAR(8, 8)
OBMC_SUBPIX_VAR(8, 4)
OBMC_SUBPIX_VAR(4, 8)
OBMC_SUBPIX_VAR(4, 4)
OBMC_SUBPIX_VAR(4, 16)
OBMC_SUBPIX_VAR(16, 4)
OBMC_SUBPIX_VAR(8, 32)
OBMC_SUBPIX_VAR(32, 8)
OBMC_SUBPIX_VAR(16, 64)
OBMC_SUBPIX_VAR(64, 16)

////////////////////////////////////////////////////////////////////////////////
// High bit-depth
////////////////////////////////////////////////////////////////////////////////
#if CONFIG_AV1_HIGHBITDEPTH
static INLINE void hbd_obmc_variance_w4(
    const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
    const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int h) {
  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
  const int pre_step = pre_stride - 4;
  int n = 0;
  __m128i v_sum_d = _mm_setzero_si128();
  __m128i v_sse_d = _mm_setzero_si128();

  assert(IS_POWER_OF_TWO(h));

  do {
    const __m128i v_p_w = xx_loadl_64(pre + n);
    const __m128i v_m_d = xx_load_128(mask + n);
    const __m128i v_w_d = xx_load_128(wsrc + n);

    const __m128i v_p_d = _mm_cvtepu16_epi32(v_p_w);

    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
    // boundaries. We use pmaddwd, as it has lower latency on Haswell
    // than pmulld but produces the same result with these inputs.
    const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);

    const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
    const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12);
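    // Only four 32-bit differences per iteration, so there is nothing to
    // pack into 16-bit pairs; square directly with pmulld rather than the
    // packs + pmaddwd pairing used by the w8n kernels.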
    const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d);

    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d);
    v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);

    n += 4;

    if (n % 4 == 0) pre += pre_step;
  } while (n < 4 * h);

  *sum = xx_hsum_epi32_si32(v_sum_d);
  *sse = xx_hsum_epi32_si32(v_sse_d);
}

static INLINE void hbd_obmc_variance_w8n(
    const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
    const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int w,
    const int h) {
  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
  const int pre_step = pre_stride - w;
  int n = 0;
  __m128i v_sum_d = _mm_setzero_si128();
  __m128i v_sse_d = _mm_setzero_si128();

  assert(w >= 8);
  assert(IS_POWER_OF_TWO(w));
  assert(IS_POWER_OF_TWO(h));

  do {
    const __m128i v_p1_w = xx_loadl_64(pre + n + 4);
    const __m128i v_m1_d = xx_load_128(mask + n + 4);
    const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
    const __m128i v_p0_w = xx_loadl_64(pre + n);
    const __m128i v_m0_d = xx_load_128(mask + n);
    const __m128i v_w0_d = xx_load_128(wsrc + n);

    const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w);
    const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w);

    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
    // boundaries. We use pmaddwd, as it has lower latency on Haswell
    // than pmulld but produces the same result with these inputs.
    const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
    const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);

    const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
    const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);

    const __m128i v_rdiff0_d = xx_roundn_epi32(v_diff0_d, 12);
    const __m128i v_rdiff1_d = xx_roundn_epi32(v_diff1_d, 12);
    const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff0_d, v_rdiff1_d);
    const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w);

    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff0_d);
    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d);
    v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);

    n += 8;

    if (n % w == 0) pre += pre_step;
  } while (n < w * h);

  *sum += xx_hsum_epi32_si64(v_sum_d);
  *sse += xx_hsum_epi32_si64(v_sse_d);
}

static INLINE void highbd_8_obmc_variance(const uint8_t *pre8, int pre_stride,
                                          const int32_t *wsrc,
                                          const int32_t *mask, int w, int h,
                                          unsigned int *sse, int *sum) {
  int64_t sum64 = 0;
  uint64_t sse64 = 0;
  if (w == 4) {
    hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
  } else {
    hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
  }
  *sum = (int)sum64;
  *sse = (unsigned int)sse64;
}

static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride,
                                           const int32_t *wsrc,
                                           const int32_t *mask, int w, int h,
                                           unsigned int *sse, int *sum) {
  int64_t sum64 = 0;
  uint64_t sse64 = 0;
  if (w == 4) {
    hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
  } else if (w < 128 || h < 128) {
    hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
  } else {
    assert(w == 128 && h == 128);

    do {
      hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w,
                            64);
      pre8 += 64 * pre_stride;
      wsrc += 64 * w;
      mask += 64 * w;
      h -= 64;
    } while (h > 0);
  }
  *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
}
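
// Note: hbd_obmc_variance_w8n accumulates squared differences in 32-bit
// SIMD lanes and only widens to 64 bits on return, so the 128x128 case
// above is processed as two 64-row calls to keep each lane's running SSE
// below 2^31 at bit-depth 10.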

static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride,
                                           const int32_t *wsrc,
                                           const int32_t *mask, int w, int h,
                                           unsigned int *sse, int *sum) {
  int64_t sum64 = 0;
  uint64_t sse64 = 0;
  int max_pel_allowed_per_ovf = 512;
  if (w == 4) {
    hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
  } else if (w * h <= max_pel_allowed_per_ovf) {
    hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
  } else {
    int h_per_ovf = max_pel_allowed_per_ovf / w;

    assert(max_pel_allowed_per_ovf % w == 0);
    do {
      hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w,
                            h_per_ovf);
      pre8 += h_per_ovf * pre_stride;
      wsrc += h_per_ovf * w;
      mask += h_per_ovf * w;
      h -= h_per_ovf;
    } while (h > 0);
  }
  *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
}
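
// At bit-depth 12 each squared difference can approach 2^24, so the 32-bit
// lane accumulators are flushed to the 64-bit totals at least once every
// max_pel_allowed_per_ovf (512) pixels: 512 pixels / 4 lanes * 2^24 just
// stays within a lane's signed 2^31 range.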

#define HBD_OBMCVARWXH(W, H)                                               \
  unsigned int aom_highbd_8_obmc_variance##W##x##H##_sse4_1(               \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
      const int32_t *mask, unsigned int *sse) {                            \
    int sum;                                                               \
    highbd_8_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);  \
    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));          \
  }                                                                        \
                                                                           \
  unsigned int aom_highbd_10_obmc_variance##W##x##H##_sse4_1(              \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
      const int32_t *mask, unsigned int *sse) {                            \
    int sum;                                                               \
    int64_t var;                                                           \
    highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));              \
    return (var >= 0) ? (uint32_t)var : 0;                                 \
  }                                                                        \
                                                                           \
  unsigned int aom_highbd_12_obmc_variance##W##x##H##_sse4_1(              \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
      const int32_t *mask, unsigned int *sse) {                            \
    int sum;                                                               \
    int64_t var;                                                           \
    highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));              \
    return (var >= 0) ? (uint32_t)var : 0;                                 \
  }

HBD_OBMCVARWXH(128, 128)
HBD_OBMCVARWXH(128, 64)
HBD_OBMCVARWXH(64, 128)
HBD_OBMCVARWXH(64, 64)
HBD_OBMCVARWXH(64, 32)
HBD_OBMCVARWXH(32, 64)
HBD_OBMCVARWXH(32, 32)
HBD_OBMCVARWXH(32, 16)
HBD_OBMCVARWXH(16, 32)
HBD_OBMCVARWXH(16, 16)
HBD_OBMCVARWXH(16, 8)
HBD_OBMCVARWXH(8, 16)
HBD_OBMCVARWXH(8, 8)
HBD_OBMCVARWXH(8, 4)
HBD_OBMCVARWXH(4, 8)
HBD_OBMCVARWXH(4, 4)
HBD_OBMCVARWXH(4, 16)
HBD_OBMCVARWXH(16, 4)
HBD_OBMCVARWXH(8, 32)
HBD_OBMCVARWXH(32, 8)
HBD_OBMCVARWXH(16, 64)
HBD_OBMCVARWXH(64, 16)
#endif  // CONFIG_AV1_HIGHBITDEPTH