• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10 #include <immintrin.h>  // AVX2
11 #include "./vpx_dsp_rtcd.h"
12 #include "vpx/vpx_integer.h"
13 
calc_final(const __m256i * const sums,uint32_t sad_array[4])14 static INLINE void calc_final(const __m256i *const sums /*[4]*/,
15                               uint32_t sad_array[4]) {
16   const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]);
17   const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]);
18   const __m256i t2 = _mm256_hadd_epi32(t0, t1);
19   const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t2),
20                                     _mm256_extractf128_si256(t2, 1));
21   _mm_storeu_si128((__m128i *)sad_array, sum);
22 }
23 
// Sum of absolute differences between one 32x32 source block and four
// reference blocks.
//
// src_ptr:    top-left byte of the source block. Each row is fetched with the
//             aligned 256-bit load, so src_ptr and src_stride must keep every
//             row 32-byte aligned.
// src_stride: distance in bytes between consecutive source rows.
// ref_array:  top-left bytes of the four reference blocks. They are read with
//             unaligned loads.
// ref_stride: distance in bytes between consecutive rows, shared by all four
//             reference blocks.
// sad_array:  output; sad_array[k] receives the SAD against ref_array[k].
void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride,
                          const uint8_t *const ref_array[4], int ref_stride,
                          uint32_t sad_array[4]) {
  const uint8_t *ref_row[4];
  __m256i acc[4];
  int k, row;

  // Work on local copies of the reference pointers so they can be advanced
  // row by row, and start every accumulator from zero.
  for (k = 0; k < 4; ++k) {
    ref_row[k] = ref_array[k];
    acc[k] = _mm256_setzero_si256();
  }

  for (row = 0; row < 32; ++row) {
    // One 32-byte source row, shared by all four comparisons.
    const __m256i src = _mm256_load_si256((const __m256i *)src_ptr);

    for (k = 0; k < 4; ++k) {
      const __m256i ref = _mm256_loadu_si256((const __m256i *)ref_row[k]);
      // _mm256_sad_epu8 yields four partial sums, one per 64-bit lane;
      // accumulate them lane-wise.
      acc[k] = _mm256_add_epi32(acc[k], _mm256_sad_epu8(ref, src));
      ref_row[k] += ref_stride;
    }

    src_ptr += src_stride;
  }

  calc_final(acc, sad_array);
}
71 
// Sum of absolute differences between one 64x64 source block and four
// reference blocks.
//
// src_ptr:    top-left byte of the source block. Both 32-byte halves of each
//             row are fetched with the aligned 256-bit load, so src_ptr and
//             src_stride must keep every row 32-byte aligned.
// src_stride: distance in bytes between consecutive source rows.
// ref_array:  top-left bytes of the four reference blocks. They are read with
//             unaligned loads.
// ref_stride: distance in bytes between consecutive rows, shared by all four
//             reference blocks.
// sad_array:  output; sad_array[k] receives the SAD against ref_array[k].
void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride,
                          const uint8_t *const ref_array[4], int ref_stride,
                          uint32_t sad_array[4]) {
  const uint8_t *ref_row[4];
  __m256i acc[4];
  int k, row;

  // Work on local copies of the reference pointers so they can be advanced
  // row by row, and start every accumulator from zero.
  for (k = 0; k < 4; ++k) {
    ref_row[k] = ref_array[k];
    acc[k] = _mm256_setzero_si256();
  }

  for (row = 0; row < 64; ++row) {
    // A 64-byte source row is handled as two 32-byte halves.
    const __m256i src_lo = _mm256_load_si256((const __m256i *)src_ptr);
    const __m256i src_hi = _mm256_load_si256((const __m256i *)(src_ptr + 32));

    for (k = 0; k < 4; ++k) {
      const __m256i ref_lo = _mm256_loadu_si256((const __m256i *)ref_row[k]);
      const __m256i ref_hi =
          _mm256_loadu_si256((const __m256i *)(ref_row[k] + 32));
      // _mm256_sad_epu8 yields four partial sums, one per 64-bit lane;
      // accumulate the low half first, then the high half.
      acc[k] = _mm256_add_epi32(acc[k], _mm256_sad_epu8(ref_lo, src_lo));
      acc[k] = _mm256_add_epi32(acc[k], _mm256_sad_epu8(ref_hi, src_hi));
      ref_row[k] += ref_stride;
    }

    src_ptr += src_stride;
  }

  calc_final(acc, sad_array);
}
131