/*
 *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <immintrin.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
13
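// Reduce the eight 32-bit lane sums in sums_32 to a single scalar total:
// fold each 128-bit half horizontally, then add the low and high halves.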
static VPX_FORCE_INLINE unsigned int calc_final(const __m256i sums_32) {
  const __m256i t0 = _mm256_add_epi32(sums_32, _mm256_srli_si256(sums_32, 8));
  const __m256i t1 = _mm256_add_epi32(t0, _mm256_srli_si256(t0, 4));
  const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t1),
                                    _mm256_extractf128_si256(t1, 1));
  return (unsigned int)_mm_cvtsi128_si32(sum);
}

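// Accumulate the SAD for `height` rows of a 64-wide block. Each row loads
// four 16-lane vectors of src and ref, so every 16-bit lane of *sums_16
// receives four absolute differences per row; the caller must widen the
// accumulator to 32 bits before it can overflow.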
static VPX_FORCE_INLINE void highbd_sad64xH(__m256i *sums_16,
                                            const uint16_t *src, int src_stride,
                                            uint16_t *ref, int ref_stride,
                                            int height) {
  int i;
  for (i = 0; i < height; ++i) {
    // load src and all ref[]
    const __m256i s0 = _mm256_load_si256((const __m256i *)src);
    const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
    const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32));
    const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48));
    const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
    const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
    const __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref + 32));
    const __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref + 48));
    // absolute differences between every ref[] and src
    const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0));
    const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1));
    const __m256i abs_diff2 = _mm256_abs_epi16(_mm256_sub_epi16(r2, s2));
    const __m256i abs_diff3 = _mm256_abs_epi16(_mm256_sub_epi16(r3, s3));
    // sum every abs diff
    *sums_16 =
        _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff0, abs_diff1));
    *sums_16 =
        _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff2, abs_diff3));

    src += src_stride;
    ref += ref_stride;
  }
}

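// SAD for a 64xN block. The 16-bit accumulator is flushed into the 32-bit
// sums_32 every 2 rows; assuming at most 12-bit input, the worst case per
// lane is 2 rows * 4 diffs * 4095 = 32760, well within uint16_t range.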
#define HIGHBD_SAD64XN(n)                                                    \
  unsigned int vpx_highbd_sad64x##n##_avx2(                                  \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,        \
      int ref_stride) {                                                      \
    const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                      \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                            \
    __m256i sums_32 = _mm256_setzero_si256();                                \
    int i;                                                                   \
                                                                             \
    for (i = 0; i < (n / 2); ++i) {                                          \
      __m256i sums_16 = _mm256_setzero_si256();                              \
                                                                             \
      highbd_sad64xH(&sums_16, src, src_stride, ref, ref_stride, 2);         \
                                                                             \
      /* sums_16 can overflow, so flush it into sums_32 every 2 rows. */     \
      sums_32 = _mm256_add_epi32(                                            \
          sums_32,                                                           \
          _mm256_add_epi32(                                                  \
              _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),        \
              _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \
                                                                             \
      src += src_stride << 1;                                                \
      ref += ref_stride << 1;                                                \
    }                                                                        \
    return calc_final(sums_32);                                              \
  }

// 64x64
HIGHBD_SAD64XN(64)

// 64x32
HIGHBD_SAD64XN(32)

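// Accumulate the SAD for `height` rows of a 32-wide block; each 16-bit lane
// of *sums_16 receives two absolute differences per row.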
static VPX_FORCE_INLINE void highbd_sad32xH(__m256i *sums_16,
                                            const uint16_t *src, int src_stride,
                                            uint16_t *ref, int ref_stride,
                                            int height) {
  int i;
  for (i = 0; i < height; ++i) {
    // load src and all ref[]
    const __m256i s0 = _mm256_load_si256((const __m256i *)src);
    const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
    const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
    const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
    // absolute differences between every ref[] and src
    const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0));
    const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1));
    // sum every abs diff
    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0);
    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1);

    src += src_stride;
    ref += ref_stride;
  }
}

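// SAD for a 32xN block, flushing the 16-bit accumulator into sums_32 every
// 8 rows; assuming at most 12-bit input, the worst case per lane is
// 8 rows * 2 diffs * 4095 = 65520, which still fits in uint16_t.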
#define HIGHBD_SAD32XN(n)                                                    \
  unsigned int vpx_highbd_sad32x##n##_avx2(                                  \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,        \
      int ref_stride) {                                                      \
    const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                      \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                            \
    __m256i sums_32 = _mm256_setzero_si256();                                \
    int i;                                                                   \
                                                                             \
    for (i = 0; i < (n / 8); ++i) {                                          \
      __m256i sums_16 = _mm256_setzero_si256();                              \
                                                                             \
      highbd_sad32xH(&sums_16, src, src_stride, ref, ref_stride, 8);         \
                                                                             \
      /* sums_16 can overflow, so flush it into sums_32 every 8 rows. */     \
      sums_32 = _mm256_add_epi32(                                            \
          sums_32,                                                           \
          _mm256_add_epi32(                                                  \
              _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),        \
              _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \
                                                                             \
      src += src_stride << 3;                                                \
      ref += ref_stride << 3;                                                \
    }                                                                        \
    return calc_final(sums_32);                                              \
  }

// 32x64
HIGHBD_SAD32XN(64)

// 32x32
HIGHBD_SAD32XN(32)

// 32x16
HIGHBD_SAD32XN(16)

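// Accumulate the SAD for `height` rows of a 16-wide block, processing two
// rows per iteration so each 256-bit vector holds one full row.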
static VPX_FORCE_INLINE void highbd_sad16xH(__m256i *sums_16,
                                            const uint16_t *src, int src_stride,
                                            uint16_t *ref, int ref_stride,
                                            int height) {
  int i;
  for (i = 0; i < height; i += 2) {
    // load src and all ref[]
    const __m256i s0 = _mm256_load_si256((const __m256i *)src);
    const __m256i s1 = _mm256_load_si256((const __m256i *)(src + src_stride));
    const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
    const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + ref_stride));
    // absolute differences between every ref[] and src
    const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0));
    const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1));
    // sum every abs diff
    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0);
    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1);

    src += src_stride << 1;
    ref += ref_stride << 1;
  }
}

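// 16x32: split into two 16-row passes so the 16-bit accumulator (one
// absolute difference per lane per row) is widened before it can overflow.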
unsigned int vpx_highbd_sad16x32_avx2(const uint8_t *src_ptr, int src_stride,
                                      const uint8_t *ref_ptr, int ref_stride) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
  __m256i sums_32 = _mm256_setzero_si256();
  int i;

  for (i = 0; i < 2; ++i) {
    __m256i sums_16 = _mm256_setzero_si256();

    highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 16);

    // sums_16 can overflow, so flush it into sums_32 every 16 rows
    sums_32 = _mm256_add_epi32(
        sums_32,
        _mm256_add_epi32(
            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));

    src += src_stride << 4;
    ref += ref_stride << 4;
  }
  return calc_final(sums_32);
}

unsigned int vpx_highbd_sad16x16_avx2(const uint8_t *src_ptr, int src_stride,
                                      const uint8_t *ref_ptr, int ref_stride) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
  __m256i sums_16 = _mm256_setzero_si256();

  highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 16);

  {
    const __m256i sums_32 = _mm256_add_epi32(
        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)));
    return calc_final(sums_32);
  }
}

unsigned int vpx_highbd_sad16x8_avx2(const uint8_t *src_ptr, int src_stride,
                                     const uint8_t *ref_ptr, int ref_stride) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
  __m256i sums_16 = _mm256_setzero_si256();

  highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 8);

  {
    const __m256i sums_32 = _mm256_add_epi32(
        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)));
    return calc_final(sums_32);
  }
}

// AVG -------------------------------------------------------------------------
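// The *_avg variants compute the SAD against the rounding average of the
// reference block and a caller-supplied second predictor, using
// _mm256_avg_epu16, i.e. (ref + sec + 1) >> 1 per pixel.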
static VPX_FORCE_INLINE void highbd_sad64xH_avg(__m256i *sums_16,
                                                const uint16_t *src,
                                                int src_stride, uint16_t *ref,
                                                int ref_stride, uint16_t *sec,
                                                int height) {
  int i;
  for (i = 0; i < height; ++i) {
    // load src and all ref[]
    const __m256i s0 = _mm256_load_si256((const __m256i *)src);
    const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
    const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32));
    const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48));
    const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
    const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
    const __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref + 32));
    const __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref + 48));
    const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec);
    const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16));
    const __m256i x2 = _mm256_loadu_si256((const __m256i *)(sec + 32));
    const __m256i x3 = _mm256_loadu_si256((const __m256i *)(sec + 48));
    const __m256i avg0 = _mm256_avg_epu16(r0, x0);
    const __m256i avg1 = _mm256_avg_epu16(r1, x1);
    const __m256i avg2 = _mm256_avg_epu16(r2, x2);
    const __m256i avg3 = _mm256_avg_epu16(r3, x3);
    // absolute differences between every ref/pred avg and src
    const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0));
    const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1));
    const __m256i abs_diff2 = _mm256_abs_epi16(_mm256_sub_epi16(avg2, s2));
    const __m256i abs_diff3 = _mm256_abs_epi16(_mm256_sub_epi16(avg3, s3));
    // sum every abs diff
    *sums_16 =
        _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff0, abs_diff1));
    *sums_16 =
        _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff2, abs_diff3));

    src += src_stride;
    ref += ref_stride;
    sec += 64;
  }
}

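// Averaged SAD for a 64xN block; the second predictor is a contiguous
// 64-wide block, so `sec` advances by 64 per row. Accumulator flushing
// mirrors HIGHBD_SAD64XN above.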
#define HIGHBD_SAD64XN_AVG(n)                                                 \
  unsigned int vpx_highbd_sad64x##n##_avg_avx2(                               \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
      int ref_stride, const uint8_t *second_pred) {                           \
    const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                       \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                             \
    uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);                         \
    __m256i sums_32 = _mm256_setzero_si256();                                 \
    int i;                                                                    \
                                                                              \
    for (i = 0; i < (n / 2); ++i) {                                           \
      __m256i sums_16 = _mm256_setzero_si256();                               \
                                                                              \
      highbd_sad64xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 2); \
                                                                              \
      /* sums_16 can overflow, so flush it into sums_32 every 2 rows. */      \
      sums_32 = _mm256_add_epi32(                                             \
          sums_32,                                                            \
          _mm256_add_epi32(                                                   \
              _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),         \
              _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));  \
                                                                              \
      src += src_stride << 1;                                                 \
      ref += ref_stride << 1;                                                 \
      sec += 64 << 1;                                                         \
    }                                                                         \
    return calc_final(sums_32);                                               \
  }

// 64x64
HIGHBD_SAD64XN_AVG(64)

// 64x32
HIGHBD_SAD64XN_AVG(32)

static VPX_FORCE_INLINE void highbd_sad32xH_avg(__m256i *sums_16,
                                                const uint16_t *src,
                                                int src_stride, uint16_t *ref,
                                                int ref_stride, uint16_t *sec,
                                                int height) {
  int i;
  for (i = 0; i < height; ++i) {
    // load src and all ref[]
    const __m256i s0 = _mm256_load_si256((const __m256i *)src);
    const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
    const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
    const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
    const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec);
    const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16));
    const __m256i avg0 = _mm256_avg_epu16(r0, x0);
    const __m256i avg1 = _mm256_avg_epu16(r1, x1);
    // absolute differences between every ref/pred avg and src
    const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0));
    const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1));
    // sum every abs diff
    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0);
    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1);

    src += src_stride;
    ref += ref_stride;
    sec += 32;
  }
}

#define HIGHBD_SAD32XN_AVG(n)                                                 \
  unsigned int vpx_highbd_sad32x##n##_avg_avx2(                               \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
      int ref_stride, const uint8_t *second_pred) {                           \
    const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                       \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                             \
    uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);                         \
    __m256i sums_32 = _mm256_setzero_si256();                                 \
    int i;                                                                    \
                                                                              \
    for (i = 0; i < (n / 8); ++i) {                                           \
      __m256i sums_16 = _mm256_setzero_si256();                               \
                                                                              \
      highbd_sad32xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 8); \
                                                                              \
      /* sums_16 can overflow, so flush it into sums_32 every 8 rows. */      \
      sums_32 = _mm256_add_epi32(                                             \
          sums_32,                                                            \
          _mm256_add_epi32(                                                   \
              _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),         \
              _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));  \
                                                                              \
      src += src_stride << 3;                                                 \
      ref += ref_stride << 3;                                                 \
      sec += 32 << 3;                                                         \
    }                                                                         \
    return calc_final(sums_32);                                               \
  }

// 32x64
HIGHBD_SAD32XN_AVG(64)

// 32x32
HIGHBD_SAD32XN_AVG(32)

// 32x16
HIGHBD_SAD32XN_AVG(16)

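// Averaged SAD for a 16-wide block, two rows per iteration; `sec` advances
// by 32 (two 16-pixel rows of the contiguous second predictor).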
static VPX_FORCE_INLINE void highbd_sad16xH_avg(__m256i *sums_16,
                                                const uint16_t *src,
                                                int src_stride, uint16_t *ref,
                                                int ref_stride, uint16_t *sec,
                                                int height) {
  int i;
  for (i = 0; i < height; i += 2) {
    // load src and all ref[]
    const __m256i s0 = _mm256_load_si256((const __m256i *)src);
    const __m256i s1 = _mm256_load_si256((const __m256i *)(src + src_stride));
    const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
    const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + ref_stride));
    const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec);
    const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16));
    const __m256i avg0 = _mm256_avg_epu16(r0, x0);
    const __m256i avg1 = _mm256_avg_epu16(r1, x1);
    // absolute differences between every ref/pred avg and src
    const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0));
    const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1));
    // sum every abs diff
    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0);
    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1);

    src += src_stride << 1;
    ref += ref_stride << 1;
    sec += 32;
  }
}

unsigned int vpx_highbd_sad16x32_avg_avx2(const uint8_t *src_ptr,
                                          int src_stride,
                                          const uint8_t *ref_ptr,
                                          int ref_stride,
                                          const uint8_t *second_pred) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
  uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);
  __m256i sums_32 = _mm256_setzero_si256();
  int i;

  for (i = 0; i < 2; ++i) {
    __m256i sums_16 = _mm256_setzero_si256();

    highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 16);

    // sums_16 can overflow, so flush it into sums_32 every 16 rows
    sums_32 = _mm256_add_epi32(
        sums_32,
        _mm256_add_epi32(
            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));

    src += src_stride << 4;
    ref += ref_stride << 4;
    sec += 16 << 4;
  }
  return calc_final(sums_32);
}

unsigned int vpx_highbd_sad16x16_avg_avx2(const uint8_t *src_ptr,
                                          int src_stride,
                                          const uint8_t *ref_ptr,
                                          int ref_stride,
                                          const uint8_t *second_pred) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
  uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);
  __m256i sums_16 = _mm256_setzero_si256();

  highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 16);

  {
    const __m256i sums_32 = _mm256_add_epi32(
        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)));
    return calc_final(sums_32);
  }
}

unsigned int vpx_highbd_sad16x8_avg_avx2(const uint8_t *src_ptr, int src_stride,
                                         const uint8_t *ref_ptr, int ref_stride,
                                         const uint8_t *second_pred) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
  uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);
  __m256i sums_16 = _mm256_setzero_si256();

  highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 8);

  {
    const __m256i sums_32 = _mm256_add_epi32(
        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)));
    return calc_final(sums_32);
  }
}