/*
 *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include <immintrin.h>
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"

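// Reduce the eight 32-bit lane sums in sums_32 to the final SAD value.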
static VPX_FORCE_INLINE unsigned int calc_final(const __m256i sums_32) {
  const __m256i t0 = _mm256_add_epi32(sums_32, _mm256_srli_si256(sums_32, 8));
  const __m256i t1 = _mm256_add_epi32(t0, _mm256_srli_si256(t0, 4));
  const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t1),
                                    _mm256_extractf128_si256(t1, 1));
  return (unsigned int)_mm_cvtsi128_si32(sum);
}

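// Accumulate 16-bit absolute differences for `height` rows of a 64-pixel-wide
// block into *sums_16.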
static VPX_FORCE_INLINE void highbd_sad64xH(__m256i *sums_16,
                                            const uint16_t *src, int src_stride,
                                            uint16_t *ref, int ref_stride,
                                            int height) {
  int i;
  for (i = 0; i < height; ++i) {
    // load src and all ref[]
    const __m256i s0 = _mm256_load_si256((const __m256i *)src);
    const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
    const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32));
    const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48));
    const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
    const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
    const __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref + 32));
    const __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref + 48));
    // absolute differences between each ref[] and src
    const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0));
    const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1));
    const __m256i abs_diff2 = _mm256_abs_epi16(_mm256_sub_epi16(r2, s2));
    const __m256i abs_diff3 = _mm256_abs_epi16(_mm256_sub_epi16(r3, s3));
    // sum every abs diff
    *sums_16 =
        _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff0, abs_diff1));
    *sums_16 =
        _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff2, abs_diff3));

    src += src_stride;
    ref += ref_stride;
  }
}

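// Defines vpx_highbd_sad64x<n>_avx2(): the 16-bit row sums are folded into the
// 32-bit accumulators every 2 rows so they cannot overflow.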
#define HIGHBD_SAD64XN(n)                                                    \
  unsigned int vpx_highbd_sad64x##n##_avx2(                                  \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,        \
      int ref_stride) {                                                      \
    const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                      \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                            \
    __m256i sums_32 = _mm256_setzero_si256();                                \
    int i;                                                                   \
                                                                             \
    for (i = 0; i < (n / 2); ++i) {                                          \
      __m256i sums_16 = _mm256_setzero_si256();                              \
                                                                             \
      highbd_sad64xH(&sums_16, src, src_stride, ref, ref_stride, 2);         \
                                                                             \
      /* sums_16 may overflow after 2 rows, so add current sums_16 to        \
       * sums_32 */                                                          \
      sums_32 = _mm256_add_epi32(                                            \
          sums_32,                                                           \
          _mm256_add_epi32(                                                  \
              _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),        \
              _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \
                                                                             \
      src += src_stride << 1;                                                \
      ref += ref_stride << 1;                                                \
    }                                                                        \
    return calc_final(sums_32);                                              \
  }

// 64x64
HIGHBD_SAD64XN(64)

// 64x32
HIGHBD_SAD64XN(32)

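// Row accumulation for 32-pixel-wide blocks; same scheme as highbd_sad64xH.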
static VPX_FORCE_INLINE void highbd_sad32xH(__m256i *sums_16,
                                            const uint16_t *src, int src_stride,
                                            uint16_t *ref, int ref_stride,
                                            int height) {
  int i;
  for (i = 0; i < height; ++i) {
    // load src and all ref[]
    const __m256i s0 = _mm256_load_si256((const __m256i *)src);
    const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
    const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
    const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
    // absolute differences between each ref[] and src
    const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0));
    const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1));
    // sum every abs diff
    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0);
    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1);

    src += src_stride;
    ref += ref_stride;
  }
}

#define HIGHBD_SAD32XN(n)                                                    \
  unsigned int vpx_highbd_sad32x##n##_avx2(                                  \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,        \
      int ref_stride) {                                                      \
    const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                      \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                            \
    __m256i sums_32 = _mm256_setzero_si256();                                \
    int i;                                                                   \
                                                                             \
    for (i = 0; i < (n / 8); ++i) {                                          \
      __m256i sums_16 = _mm256_setzero_si256();                              \
                                                                             \
      highbd_sad32xH(&sums_16, src, src_stride, ref, ref_stride, 8);         \
                                                                             \
      /* sums_16 may overflow after 8 rows, so add current sums_16 to        \
       * sums_32 */                                                          \
      sums_32 = _mm256_add_epi32(                                            \
          sums_32,                                                           \
          _mm256_add_epi32(                                                  \
              _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),        \
              _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \
                                                                             \
      src += src_stride << 3;                                                \
      ref += ref_stride << 3;                                                \
    }                                                                        \
    return calc_final(sums_32);                                              \
  }

// 32x64
HIGHBD_SAD32XN(64)

// 32x32
HIGHBD_SAD32XN(32)

// 32x16
HIGHBD_SAD32XN(16)

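// Row accumulation for 16-pixel-wide blocks, two rows per iteration; `height`
// must be even.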
static VPX_FORCE_INLINE void highbd_sad16xH(__m256i *sums_16,
                                            const uint16_t *src, int src_stride,
                                            uint16_t *ref, int ref_stride,
                                            int height) {
  int i;
  for (i = 0; i < height; i += 2) {
    // load src and all ref[]
    const __m256i s0 = _mm256_load_si256((const __m256i *)src);
    const __m256i s1 = _mm256_load_si256((const __m256i *)(src + src_stride));
    const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
    const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + ref_stride));
    // absolute differences between each ref[] and src
    const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0));
    const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1));
    // sum every abs diff
    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0);
    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1);

    src += src_stride << 1;
    ref += ref_stride << 1;
  }
}

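// The 16-wide block sizes accumulate up to 16 rows in the 16-bit lanes before
// widening to 32 bits.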
unsigned int vpx_highbd_sad16x32_avx2(const uint8_t *src_ptr, int src_stride,
                                      const uint8_t *ref_ptr, int ref_stride) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
  __m256i sums_32 = _mm256_setzero_si256();
  int i;

  for (i = 0; i < 2; ++i) {
    __m256i sums_16 = _mm256_setzero_si256();

    highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 16);

    // sums_16 may overflow after 16 rows, so add current sums_16 to sums_32
    sums_32 = _mm256_add_epi32(
        sums_32,
        _mm256_add_epi32(
            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));

    src += src_stride << 4;
    ref += ref_stride << 4;
  }
  return calc_final(sums_32);
}

unsigned int vpx_highbd_sad16x16_avx2(const uint8_t *src_ptr, int src_stride,
                                      const uint8_t *ref_ptr, int ref_stride) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
  __m256i sums_16 = _mm256_setzero_si256();

  highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 16);

  {
    const __m256i sums_32 = _mm256_add_epi32(
        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)));
    return calc_final(sums_32);
  }
}

unsigned int vpx_highbd_sad16x8_avx2(const uint8_t *src_ptr, int src_stride,
                                     const uint8_t *ref_ptr, int ref_stride) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
  __m256i sums_16 = _mm256_setzero_si256();

  highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 8);

  {
    const __m256i sums_32 = _mm256_add_epi32(
        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)));
    return calc_final(sums_32);
  }
}

// AVG -------------------------------------------------------------------------
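// The _avg variants average ref with the second prediction (with rounding, via
// _mm256_avg_epu16) before computing the SAD against src.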
static VPX_FORCE_INLINE void highbd_sad64xH_avg(__m256i *sums_16,
                                                const uint16_t *src,
                                                int src_stride, uint16_t *ref,
                                                int ref_stride, uint16_t *sec,
                                                int height) {
  int i;
  for (i = 0; i < height; ++i) {
    // load src and all ref[]
    const __m256i s0 = _mm256_load_si256((const __m256i *)src);
    const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
    const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32));
    const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48));
    const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
    const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
    const __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref + 32));
    const __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref + 48));
    const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec);
    const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16));
    const __m256i x2 = _mm256_loadu_si256((const __m256i *)(sec + 32));
    const __m256i x3 = _mm256_loadu_si256((const __m256i *)(sec + 48));
    const __m256i avg0 = _mm256_avg_epu16(r0, x0);
    const __m256i avg1 = _mm256_avg_epu16(r1, x1);
    const __m256i avg2 = _mm256_avg_epu16(r2, x2);
    const __m256i avg3 = _mm256_avg_epu16(r3, x3);
    // absolute differences between each ref/pred average and src
    const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0));
    const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1));
    const __m256i abs_diff2 = _mm256_abs_epi16(_mm256_sub_epi16(avg2, s2));
    const __m256i abs_diff3 = _mm256_abs_epi16(_mm256_sub_epi16(avg3, s3));
    // sum every abs diff
    *sums_16 =
        _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff0, abs_diff1));
    *sums_16 =
        _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff2, abs_diff3));

    src += src_stride;
    ref += ref_stride;
    sec += 64;
  }
}

#define HIGHBD_SAD64XN_AVG(n)                                                 \
  unsigned int vpx_highbd_sad64x##n##_avg_avx2(                               \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
      int ref_stride, const uint8_t *second_pred) {                           \
    const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                       \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                             \
    uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);                         \
    __m256i sums_32 = _mm256_setzero_si256();                                 \
    int i;                                                                    \
                                                                              \
    for (i = 0; i < (n / 2); ++i) {                                           \
      __m256i sums_16 = _mm256_setzero_si256();                               \
                                                                              \
      highbd_sad64xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 2); \
                                                                              \
      /* sums_16 may overflow after 2 rows, so add current sums_16 to         \
       * sums_32 */                                                           \
      sums_32 = _mm256_add_epi32(                                             \
          sums_32,                                                            \
          _mm256_add_epi32(                                                   \
              _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),         \
              _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));  \
                                                                              \
      src += src_stride << 1;                                                 \
      ref += ref_stride << 1;                                                 \
      sec += 64 << 1;                                                         \
    }                                                                         \
    return calc_final(sums_32);                                               \
  }

// 64x64
HIGHBD_SAD64XN_AVG(64)

// 64x32
HIGHBD_SAD64XN_AVG(32)

static VPX_FORCE_INLINE void highbd_sad32xH_avg(__m256i *sums_16,
                                                const uint16_t *src,
                                                int src_stride, uint16_t *ref,
                                                int ref_stride, uint16_t *sec,
                                                int height) {
  int i;
  for (i = 0; i < height; ++i) {
    // load src and all ref[]
    const __m256i s0 = _mm256_load_si256((const __m256i *)src);
    const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
    const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
    const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
    const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec);
    const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16));
    const __m256i avg0 = _mm256_avg_epu16(r0, x0);
    const __m256i avg1 = _mm256_avg_epu16(r1, x1);
    // absolute differences between each ref/pred average and src
    const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0));
    const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1));
    // sum every abs diff
    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0);
    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1);

    src += src_stride;
    ref += ref_stride;
    sec += 32;
  }
}

#define HIGHBD_SAD32XN_AVG(n)                                                 \
  unsigned int vpx_highbd_sad32x##n##_avg_avx2(                               \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
      int ref_stride, const uint8_t *second_pred) {                           \
    const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                       \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                             \
    uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);                         \
    __m256i sums_32 = _mm256_setzero_si256();                                 \
    int i;                                                                    \
                                                                              \
    for (i = 0; i < (n / 8); ++i) {                                           \
      __m256i sums_16 = _mm256_setzero_si256();                               \
                                                                              \
      highbd_sad32xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 8); \
                                                                              \
      /* sums_16 may overflow after 8 rows, so add current sums_16 to         \
       * sums_32 */                                                           \
      sums_32 = _mm256_add_epi32(                                             \
          sums_32,                                                            \
          _mm256_add_epi32(                                                   \
              _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),         \
              _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));  \
                                                                              \
      src += src_stride << 3;                                                 \
      ref += ref_stride << 3;                                                 \
      sec += 32 << 3;                                                         \
    }                                                                         \
    return calc_final(sums_32);                                               \
  }

// 32x64
HIGHBD_SAD32XN_AVG(64)

// 32x32
HIGHBD_SAD32XN_AVG(32)

// 32x16
HIGHBD_SAD32XN_AVG(16)

static VPX_FORCE_INLINE void highbd_sad16xH_avg(__m256i *sums_16,
                                                const uint16_t *src,
                                                int src_stride, uint16_t *ref,
                                                int ref_stride, uint16_t *sec,
                                                int height) {
  int i;
  for (i = 0; i < height; i += 2) {
    // load src and all ref[]
    const __m256i s0 = _mm256_load_si256((const __m256i *)src);
    const __m256i s1 = _mm256_load_si256((const __m256i *)(src + src_stride));
    const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
    const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + ref_stride));
    const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec);
    const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16));
    const __m256i avg0 = _mm256_avg_epu16(r0, x0);
    const __m256i avg1 = _mm256_avg_epu16(r1, x1);
    // absolute differences between each ref/pred average and src
    const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0));
    const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1));
    // sum every abs diff
    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0);
    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1);

    src += src_stride << 1;
    ref += ref_stride << 1;
    sec += 32;
  }
}

unsigned int vpx_highbd_sad16x32_avg_avx2(const uint8_t *src_ptr,
                                          int src_stride,
                                          const uint8_t *ref_ptr,
                                          int ref_stride,
                                          const uint8_t *second_pred) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
  uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);
  __m256i sums_32 = _mm256_setzero_si256();
  int i;

  for (i = 0; i < 2; ++i) {
    __m256i sums_16 = _mm256_setzero_si256();

    highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 16);

    // sums_16 may overflow after 16 rows, so add current sums_16 to sums_32
    sums_32 = _mm256_add_epi32(
        sums_32,
        _mm256_add_epi32(
            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));

    src += src_stride << 4;
    ref += ref_stride << 4;
    sec += 16 << 4;
  }
  return calc_final(sums_32);
}

unsigned int vpx_highbd_sad16x16_avg_avx2(const uint8_t *src_ptr,
                                          int src_stride,
                                          const uint8_t *ref_ptr,
                                          int ref_stride,
                                          const uint8_t *second_pred) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
  uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);
  __m256i sums_16 = _mm256_setzero_si256();

  highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 16);

  {
    const __m256i sums_32 = _mm256_add_epi32(
        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)));
    return calc_final(sums_32);
  }
}

unsigned int vpx_highbd_sad16x8_avg_avx2(const uint8_t *src_ptr, int src_stride,
                                         const uint8_t *ref_ptr, int ref_stride,
                                         const uint8_t *second_pred) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
  uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);
  __m256i sums_16 = _mm256_setzero_si256();

  highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 8);

  {
    const __m256i sums_32 = _mm256_add_epi32(
        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)));
    return calc_final(sums_32);
  }
}