1 /*
2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10 #include <immintrin.h> // AVX2
11 #include "./vpx_dsp_rtcd.h"
12 #include "vpx/vpx_integer.h"
13
calc_final(const __m256i * const sums,uint32_t sad_array[4])14 static INLINE void calc_final(const __m256i *const sums /*[4]*/,
15 uint32_t sad_array[4]) {
16 const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]);
17 const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]);
18 const __m256i t2 = _mm256_hadd_epi32(t0, t1);
19 const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t2),
20 _mm256_extractf128_si256(t2, 1));
21 _mm_storeu_si128((__m128i *)sad_array, sum);
22 }
23
// Computes the sum of absolute differences (SAD) between one 32x32 block of
// 8-bit samples and four reference blocks in a single pass.
//
//   src_ptr     first byte of the source block
//   src_stride  offset added to src_ptr to advance one row
//   ref_array   first byte of each of the four reference blocks
//   ref_stride  offset added to every reference pointer to advance one row
//   sad_array   receives the four results; sad_array[k] is the SAD against
//               ref_array[k]
//
// Neither the source nor the reference rows need to be 32-byte aligned.
void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride,
                          const uint8_t *const ref_array[4], int ref_stride,
                          uint32_t sad_array[4]) {
  int i;
  const uint8_t *refs[4];
  __m256i sums[4];

  // Work on local copies of the reference pointers so they can be advanced
  // row by row; ref_array itself is const.
  refs[0] = ref_array[0];
  refs[1] = ref_array[1];
  refs[2] = ref_array[2];
  refs[3] = ref_array[3];
  sums[0] = _mm256_setzero_si256();
  sums[1] = _mm256_setzero_si256();
  sums[2] = _mm256_setzero_si256();
  sums[3] = _mm256_setzero_si256();

  for (i = 0; i < 32; i++) {
    __m256i r[4];

    // Load one 32-byte row of src and of every ref. The source uses an
    // unaligned load like the references: nothing in this function guarantees
    // that src_ptr or src_stride keep each row 32-byte aligned, and an aligned
    // load would fault on a misaligned row.
    const __m256i s = _mm256_loadu_si256((const __m256i *)src_ptr);
    r[0] = _mm256_loadu_si256((const __m256i *)refs[0]);
    r[1] = _mm256_loadu_si256((const __m256i *)refs[1]);
    r[2] = _mm256_loadu_si256((const __m256i *)refs[2]);
    r[3] = _mm256_loadu_si256((const __m256i *)refs[3]);

    // Row SAD of every ref against src. Each 64-bit lane receives the SAD of
    // its own 8 bytes (at most 8 * 255), so the upper 32 bits of every lane
    // are zero.
    r[0] = _mm256_sad_epu8(r[0], s);
    r[1] = _mm256_sad_epu8(r[1], s);
    r[2] = _mm256_sad_epu8(r[2], s);
    r[3] = _mm256_sad_epu8(r[3], s);

    // Accumulate per ref. 32-bit adds are sufficient: a lane grows by at most
    // 2040 per row, i.e. at most 32 * 2040 = 65280 over the whole block.
    sums[0] = _mm256_add_epi32(sums[0], r[0]);
    sums[1] = _mm256_add_epi32(sums[1], r[1]);
    sums[2] = _mm256_add_epi32(sums[2], r[2]);
    sums[3] = _mm256_add_epi32(sums[3], r[3]);

    // Advance every pointer to the next row.
    src_ptr += src_stride;
    refs[0] += ref_stride;
    refs[1] += ref_stride;
    refs[2] += ref_stride;
    refs[3] += ref_stride;
  }

  // Reduce each accumulator to a single 32-bit total and store the results.
  calc_final(sums, sad_array);
}
71
// Computes the sum of absolute differences (SAD) between one 64x64 block of
// 8-bit samples and four reference blocks in a single pass.
//
//   src_ptr     first byte of the source block
//   src_stride  offset added to src_ptr to advance one row
//   ref_array   first byte of each of the four reference blocks
//   ref_stride  offset added to every reference pointer to advance one row
//   sad_array   receives the four results; sad_array[k] is the SAD against
//               ref_array[k]
//
// Neither the source nor the reference rows need to be 32-byte aligned.
void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride,
                          const uint8_t *const ref_array[4], int ref_stride,
                          uint32_t sad_array[4]) {
  __m256i sums[4];
  int i;
  const uint8_t *refs[4];

  // Work on local copies of the reference pointers so they can be advanced
  // row by row; ref_array itself is const.
  refs[0] = ref_array[0];
  refs[1] = ref_array[1];
  refs[2] = ref_array[2];
  refs[3] = ref_array[3];
  sums[0] = _mm256_setzero_si256();
  sums[1] = _mm256_setzero_si256();
  sums[2] = _mm256_setzero_si256();
  sums[3] = _mm256_setzero_si256();

  for (i = 0; i < 64; i++) {
    __m256i r_lo[4], r_hi[4];

    // Load one 64-byte row of src and of every ref as two 32-byte halves.
    // The source uses unaligned loads like the references: nothing in this
    // function guarantees that src_ptr or src_stride keep each row 32-byte
    // aligned, and an aligned load would fault on a misaligned row.
    const __m256i s_lo = _mm256_loadu_si256((const __m256i *)src_ptr);
    const __m256i s_hi = _mm256_loadu_si256((const __m256i *)(src_ptr + 32));
    r_lo[0] = _mm256_loadu_si256((const __m256i *)refs[0]);
    r_hi[0] = _mm256_loadu_si256((const __m256i *)(refs[0] + 32));
    r_lo[1] = _mm256_loadu_si256((const __m256i *)refs[1]);
    r_hi[1] = _mm256_loadu_si256((const __m256i *)(refs[1] + 32));
    r_lo[2] = _mm256_loadu_si256((const __m256i *)refs[2]);
    r_hi[2] = _mm256_loadu_si256((const __m256i *)(refs[2] + 32));
    r_lo[3] = _mm256_loadu_si256((const __m256i *)refs[3]);
    r_hi[3] = _mm256_loadu_si256((const __m256i *)(refs[3] + 32));

    // Row SAD of every ref half against the matching src half. Each 64-bit
    // lane receives the SAD of its own 8 bytes (at most 8 * 255), so the upper
    // 32 bits of every lane are zero.
    r_lo[0] = _mm256_sad_epu8(r_lo[0], s_lo);
    r_lo[1] = _mm256_sad_epu8(r_lo[1], s_lo);
    r_lo[2] = _mm256_sad_epu8(r_lo[2], s_lo);
    r_lo[3] = _mm256_sad_epu8(r_lo[3], s_lo);
    r_hi[0] = _mm256_sad_epu8(r_hi[0], s_hi);
    r_hi[1] = _mm256_sad_epu8(r_hi[1], s_hi);
    r_hi[2] = _mm256_sad_epu8(r_hi[2], s_hi);
    r_hi[3] = _mm256_sad_epu8(r_hi[3], s_hi);

    // Accumulate both halves per ref. 32-bit adds are sufficient: a lane
    // grows by at most 2 * 2040 per row, i.e. at most 64 * 4080 = 261120 over
    // the whole block.
    sums[0] = _mm256_add_epi32(sums[0], r_lo[0]);
    sums[1] = _mm256_add_epi32(sums[1], r_lo[1]);
    sums[2] = _mm256_add_epi32(sums[2], r_lo[2]);
    sums[3] = _mm256_add_epi32(sums[3], r_lo[3]);
    sums[0] = _mm256_add_epi32(sums[0], r_hi[0]);
    sums[1] = _mm256_add_epi32(sums[1], r_hi[1]);
    sums[2] = _mm256_add_epi32(sums[2], r_hi[2]);
    sums[3] = _mm256_add_epi32(sums[3], r_hi[3]);

    // Advance every pointer to the next row.
    src_ptr += src_stride;
    refs[0] += ref_stride;
    refs[1] += ref_stride;
    refs[2] += ref_stride;
    refs[3] += ref_stride;
  }

  // Reduce each accumulator to a single 32-bit total and store the results.
  calc_final(sums, sad_array);
}
131