/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <immintrin.h>

#include "config/aom_dsp_rtcd.h"
#include "aom/aom_integer.h"
#include "aom_dsp/x86/bitdepth_conversion_sse2.h"
#include "aom_dsp/x86/mem_sse2.h"
#include "aom_ports/mem.h"

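// Sign-extends the eight 16-bit lanes of |in| to 32 bits, writing the low
// four lanes to *out_lo and the high four lanes to *out_hi. |zero| must be a
// zeroed register.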
static INLINE void sign_extend_16bit_to_32bit_sse2(__m128i in, __m128i zero,
                                                   __m128i *out_lo,
                                                   __m128i *out_hi) {
  const __m128i sign_bits = _mm_cmplt_epi16(in, zero);
  *out_lo = _mm_unpacklo_epi16(in, sign_bits);
  *out_hi = _mm_unpackhi_epi16(in, sign_bits);
}

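// Conditionally negates each 32-bit lane of |a|: lanes where |sign| is all
// ones are negated (two's complement via xor + subtract), lanes where |sign|
// is zero pass through unchanged.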
static INLINE __m128i invert_sign_32_sse2(__m128i a, __m128i sign) {
  a = _mm_xor_si128(a, sign);
  return _mm_sub_epi32(a, sign);
}

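// Computes the smallest and largest per-pixel absolute difference between the
// 8x8 blocks at |s| (stride |p|) and |d| (stride |dp|), writing the results to
// *min and *max.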
void aom_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
                         int *min, int *max) {
  __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff;
  u0 = _mm_setzero_si128();
  // Row 0
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff0 = _mm_max_epi16(diff, negdiff);
  // Row 1
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(absdiff0, absdiff);
  minabsdiff = _mm_min_epi16(absdiff0, absdiff);
  // Row 2
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 2 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 3
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 3 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 4
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 4 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 5
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 5 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 6
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 6 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 7
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 7 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);

  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8));
  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32));
  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16));
  *max = _mm_extract_epi16(maxabsdiff, 0);

  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8));
  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32));
  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16));
  *min = _mm_extract_epi16(minabsdiff, 0);
}

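// Returns the rounded average of the 8x8 block of pixels at |s| with stride
// |p|, i.e. (sum of the 64 pixels + 32) >> 6.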
unsigned int aom_avg_8x8_sse2(const uint8_t *s, int p) {
  __m128i sum0, sum1, s0, s1, s2, s3, u0;
  unsigned int avg = 0;
  u0 = _mm_setzero_si128();
  s0 = loadh_epi64((const __m128i *)(s + p),
                   _mm_loadl_epi64((const __m128i *)(s)));
  s1 = loadh_epi64((const __m128i *)(s + 3 * p),
                   _mm_loadl_epi64((const __m128i *)(s + 2 * p)));
  s2 = loadh_epi64((const __m128i *)(s + 5 * p),
                   _mm_loadl_epi64((const __m128i *)(s + 4 * p)));
  s3 = loadh_epi64((const __m128i *)(s + 7 * p),
                   _mm_loadl_epi64((const __m128i *)(s + 6 * p)));
  s0 = _mm_sad_epu8(s0, u0);
  s1 = _mm_sad_epu8(s1, u0);
  s2 = _mm_sad_epu8(s2, u0);
  s3 = _mm_sad_epu8(s3, u0);

  sum0 = _mm_add_epi16(s0, s1);
  sum1 = _mm_add_epi16(s2, s3);
  sum0 = _mm_add_epi16(sum0, sum1);
  sum0 = _mm_add_epi16(sum0, _mm_srli_si128(sum0, 8));
  avg = _mm_cvtsi128_si32(sum0);
  return (avg + 32) >> 6;
}

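// Computes the rounded averages of two horizontally adjacent 8x8 blocks
// starting at |s| (stride |p|) in a single pass, writing them to avg[0] and
// avg[1].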
void calc_avg_8x8_dual_sse2(const uint8_t *s, int p, int *avg) {
  __m128i sum0, sum1, s0, s1, s2, s3, u0;
  u0 = _mm_setzero_si128();
  s0 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s)), u0);
  s1 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + p)), u0);
  s2 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 2 * p)), u0);
  s3 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 3 * p)), u0);
  sum0 = _mm_add_epi16(s0, s1);
  sum1 = _mm_add_epi16(s2, s3);
  s0 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 4 * p)), u0);
  s1 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 5 * p)), u0);
  s2 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 6 * p)), u0);
  s3 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 7 * p)), u0);
  sum0 = _mm_add_epi16(sum0, _mm_add_epi16(s0, s1));
  sum1 = _mm_add_epi16(sum1, _mm_add_epi16(s2, s3));
  sum0 = _mm_add_epi16(sum0, sum1);

  // (avg + 32) >> 6
  __m128i rounding = _mm_set1_epi32(32);
  sum0 = _mm_add_epi32(sum0, rounding);
  sum0 = _mm_srli_epi32(sum0, 6);
  avg[0] = _mm_cvtsi128_si32(sum0);
  avg[1] = _mm_extract_epi16(sum0, 4);
}

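// Computes the averages of the four 8x8 sub-blocks of the 16x16 block located
// at offset (x16_idx, y16_idx) from |s|, storing them in avg[0..3].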
void aom_avg_8x8_quad_sse2(const uint8_t *s, int p, int x16_idx, int y16_idx,
                           int *avg) {
  const uint8_t *s_ptr = s + y16_idx * p + x16_idx;
  for (int k = 0; k < 2; k++) {
    calc_avg_8x8_dual_sse2(s_ptr, p, avg + k * 2);
    s_ptr += 8 * p;
  }
}

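// Returns the rounded average of the 4x4 block of pixels at |s| with stride
// |p|, i.e. (sum of the 16 pixels + 8) >> 4.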
unsigned int aom_avg_4x4_sse2(const uint8_t *s, int p) {
  __m128i s0, s1, u0;
  unsigned int avg = 0;
  u0 = _mm_setzero_si128();
  s0 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)(s)),
                          _mm_cvtsi32_si128(*(const int *)(s + p)));
  s1 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)(s + p * 2)),
                          _mm_cvtsi32_si128(*(const int *)(s + p * 3)));
  s0 = _mm_sad_epu8(s0, u0);
  s1 = _mm_sad_epu8(s1, u0);
  s0 = _mm_add_epi16(s0, s1);
  avg = _mm_cvtsi128_si32(s0);
  return (avg + 8) >> 4;
}

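// One pass of the 4-point Hadamard butterfly over four rows of 16-bit values.
// The first butterfly stage is halved (>> 1) to keep intermediates within
// 16-bit range. For the first pass (iter == 0) the 4x4 result is also
// transposed so that the second pass operates on columns.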
static INLINE void hadamard_col4_sse2(__m128i *in, int iter) {
  const __m128i a0 = in[0];
  const __m128i a1 = in[1];
  const __m128i a2 = in[2];
  const __m128i a3 = in[3];
  const __m128i b0 = _mm_srai_epi16(_mm_add_epi16(a0, a1), 1);
  const __m128i b1 = _mm_srai_epi16(_mm_sub_epi16(a0, a1), 1);
  const __m128i b2 = _mm_srai_epi16(_mm_add_epi16(a2, a3), 1);
  const __m128i b3 = _mm_srai_epi16(_mm_sub_epi16(a2, a3), 1);
  in[0] = _mm_add_epi16(b0, b2);
  in[1] = _mm_add_epi16(b1, b3);
  in[2] = _mm_sub_epi16(b0, b2);
  in[3] = _mm_sub_epi16(b1, b3);

  if (iter == 0) {
    const __m128i ba = _mm_unpacklo_epi16(in[0], in[1]);
    const __m128i dc = _mm_unpacklo_epi16(in[2], in[3]);
    const __m128i dcba_lo = _mm_unpacklo_epi32(ba, dc);
    const __m128i dcba_hi = _mm_unpackhi_epi32(ba, dc);
    in[0] = dcba_lo;
    in[1] = _mm_srli_si128(dcba_lo, 8);
    in[2] = dcba_hi;
    in[3] = _mm_srli_si128(dcba_hi, 8);
  }
}

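// 4x4 Hadamard transform of a block of residuals: two passes of
// hadamard_col4_sse2() (rows, then columns), with the result written out as
// tran_low_t.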
void aom_hadamard_4x4_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
                           tran_low_t *coeff) {
  __m128i src[4];
  src[0] = _mm_loadl_epi64((const __m128i *)src_diff);
  src[1] = _mm_loadl_epi64((const __m128i *)(src_diff += src_stride));
  src[2] = _mm_loadl_epi64((const __m128i *)(src_diff += src_stride));
  src[3] = _mm_loadl_epi64((const __m128i *)(src_diff + src_stride));

  hadamard_col4_sse2(src, 0);
  hadamard_col4_sse2(src, 1);

  store_tran_low(_mm_unpacklo_epi64(src[0], src[1]), coeff);
  coeff += 8;
  store_tran_low(_mm_unpacklo_epi64(src[2], src[3]), coeff);
}

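// One pass of the 8-point Hadamard butterfly over eight rows of 16-bit values.
// For the first pass (iter == 0) the 8x8 result is also transposed so that the
// second pass operates on columns; the second pass writes its outputs in a
// permuted row order.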
static INLINE void hadamard_col8_sse2(__m128i *in, int iter) {
  __m128i a0 = in[0];
  __m128i a1 = in[1];
  __m128i a2 = in[2];
  __m128i a3 = in[3];
  __m128i a4 = in[4];
  __m128i a5 = in[5];
  __m128i a6 = in[6];
  __m128i a7 = in[7];

  __m128i b0 = _mm_add_epi16(a0, a1);
  __m128i b1 = _mm_sub_epi16(a0, a1);
  __m128i b2 = _mm_add_epi16(a2, a3);
  __m128i b3 = _mm_sub_epi16(a2, a3);
  __m128i b4 = _mm_add_epi16(a4, a5);
  __m128i b5 = _mm_sub_epi16(a4, a5);
  __m128i b6 = _mm_add_epi16(a6, a7);
  __m128i b7 = _mm_sub_epi16(a6, a7);

  a0 = _mm_add_epi16(b0, b2);
  a1 = _mm_add_epi16(b1, b3);
  a2 = _mm_sub_epi16(b0, b2);
  a3 = _mm_sub_epi16(b1, b3);
  a4 = _mm_add_epi16(b4, b6);
  a5 = _mm_add_epi16(b5, b7);
  a6 = _mm_sub_epi16(b4, b6);
  a7 = _mm_sub_epi16(b5, b7);

  if (iter == 0) {
    b0 = _mm_add_epi16(a0, a4);
    b7 = _mm_add_epi16(a1, a5);
    b3 = _mm_add_epi16(a2, a6);
    b4 = _mm_add_epi16(a3, a7);
    b2 = _mm_sub_epi16(a0, a4);
    b6 = _mm_sub_epi16(a1, a5);
    b1 = _mm_sub_epi16(a2, a6);
    b5 = _mm_sub_epi16(a3, a7);

    a0 = _mm_unpacklo_epi16(b0, b1);
    a1 = _mm_unpacklo_epi16(b2, b3);
    a2 = _mm_unpackhi_epi16(b0, b1);
    a3 = _mm_unpackhi_epi16(b2, b3);
    a4 = _mm_unpacklo_epi16(b4, b5);
    a5 = _mm_unpacklo_epi16(b6, b7);
    a6 = _mm_unpackhi_epi16(b4, b5);
    a7 = _mm_unpackhi_epi16(b6, b7);

    b0 = _mm_unpacklo_epi32(a0, a1);
    b1 = _mm_unpacklo_epi32(a4, a5);
    b2 = _mm_unpackhi_epi32(a0, a1);
    b3 = _mm_unpackhi_epi32(a4, a5);
    b4 = _mm_unpacklo_epi32(a2, a3);
    b5 = _mm_unpacklo_epi32(a6, a7);
    b6 = _mm_unpackhi_epi32(a2, a3);
    b7 = _mm_unpackhi_epi32(a6, a7);

    in[0] = _mm_unpacklo_epi64(b0, b1);
    in[1] = _mm_unpackhi_epi64(b0, b1);
    in[2] = _mm_unpacklo_epi64(b2, b3);
    in[3] = _mm_unpackhi_epi64(b2, b3);
    in[4] = _mm_unpacklo_epi64(b4, b5);
    in[5] = _mm_unpackhi_epi64(b4, b5);
    in[6] = _mm_unpacklo_epi64(b6, b7);
    in[7] = _mm_unpackhi_epi64(b6, b7);
  } else {
    in[0] = _mm_add_epi16(a0, a4);
    in[7] = _mm_add_epi16(a1, a5);
    in[3] = _mm_add_epi16(a2, a6);
    in[4] = _mm_add_epi16(a3, a7);
    in[2] = _mm_sub_epi16(a0, a4);
    in[6] = _mm_sub_epi16(a1, a5);
    in[1] = _mm_sub_epi16(a2, a6);
    in[5] = _mm_sub_epi16(a3, a7);
  }
}

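// 8x8 Hadamard transform of the residual block at |src_diff| (16-byte aligned,
// stride |src_stride|). With is_final != 0 the result is written as tran_low_t
// via store_tran_low(); otherwise it is written as raw int16_t so that a
// larger transform can continue accumulating in 16-bit precision.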
static INLINE void hadamard_8x8_sse2(const int16_t *src_diff,
                                     ptrdiff_t src_stride, tran_low_t *coeff,
                                     int is_final) {
  __m128i src[8];
  src[0] = _mm_load_si128((const __m128i *)src_diff);
  src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[7] = _mm_load_si128((const __m128i *)(src_diff + src_stride));

  hadamard_col8_sse2(src, 0);
  hadamard_col8_sse2(src, 1);

  if (is_final) {
    store_tran_low(src[0], coeff);
    coeff += 8;
    store_tran_low(src[1], coeff);
    coeff += 8;
    store_tran_low(src[2], coeff);
    coeff += 8;
    store_tran_low(src[3], coeff);
    coeff += 8;
    store_tran_low(src[4], coeff);
    coeff += 8;
    store_tran_low(src[5], coeff);
    coeff += 8;
    store_tran_low(src[6], coeff);
    coeff += 8;
    store_tran_low(src[7], coeff);
  } else {
    int16_t *coeff16 = (int16_t *)coeff;
    _mm_store_si128((__m128i *)coeff16, src[0]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[1]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[2]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[3]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[4]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[5]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[6]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[7]);
  }
}

void aom_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
                           tran_low_t *coeff) {
  hadamard_8x8_sse2(src_diff, src_stride, coeff, 1);
}

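// Low-precision 8x8 Hadamard transform: identical butterfly to
// hadamard_8x8_sse2() but with a plain int16_t output.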
static INLINE void hadamard_lp_8x8_sse2(const int16_t *src_diff,
                                        ptrdiff_t src_stride, int16_t *coeff) {
  __m128i src[8];
  src[0] = _mm_load_si128((const __m128i *)src_diff);
  src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[7] = _mm_load_si128((const __m128i *)(src_diff + src_stride));

  hadamard_col8_sse2(src, 0);
  hadamard_col8_sse2(src, 1);

  _mm_store_si128((__m128i *)coeff, src[0]);
  coeff += 8;
  _mm_store_si128((__m128i *)coeff, src[1]);
  coeff += 8;
  _mm_store_si128((__m128i *)coeff, src[2]);
  coeff += 8;
  _mm_store_si128((__m128i *)coeff, src[3]);
  coeff += 8;
  _mm_store_si128((__m128i *)coeff, src[4]);
  coeff += 8;
  _mm_store_si128((__m128i *)coeff, src[5]);
  coeff += 8;
  _mm_store_si128((__m128i *)coeff, src[6]);
  coeff += 8;
  _mm_store_si128((__m128i *)coeff, src[7]);
}

void aom_hadamard_lp_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
                              int16_t *coeff) {
  hadamard_lp_8x8_sse2(src_diff, src_stride, coeff);
}

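// Applies the low-precision 8x8 Hadamard transform to two horizontally
// adjacent 8x8 blocks, writing 64 coefficients per block.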
void aom_hadamard_lp_8x8_dual_sse2(const int16_t *src_diff,
                                   ptrdiff_t src_stride, int16_t *coeff) {
  for (int i = 0; i < 2; i++) {
    hadamard_lp_8x8_sse2(src_diff + (i * 8), src_stride, coeff + (i * 64));
  }
}

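// Low-precision 16x16 Hadamard transform: four 8x8 transforms followed by a
// combining butterfly stage, with a >> 1 to keep intermediates within 16-bit
// range.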
void aom_hadamard_lp_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
                                int16_t *coeff) {
  for (int idx = 0; idx < 4; ++idx) {
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
    hadamard_lp_8x8_sse2(src_ptr, src_stride, coeff + idx * 64);
  }

  int16_t *t_coeff = coeff;
  for (int idx = 0; idx < 64; idx += 8) {
    __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff);
    __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 64));
    __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 128));
    __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 192));

    __m128i b0 = _mm_add_epi16(coeff0, coeff1);
    __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
    __m128i b2 = _mm_add_epi16(coeff2, coeff3);
    __m128i b3 = _mm_sub_epi16(coeff2, coeff3);

    b0 = _mm_srai_epi16(b0, 1);
    b1 = _mm_srai_epi16(b1, 1);
    b2 = _mm_srai_epi16(b2, 1);
    b3 = _mm_srai_epi16(b3, 1);

    coeff0 = _mm_add_epi16(b0, b2);
    coeff1 = _mm_add_epi16(b1, b3);
    coeff2 = _mm_sub_epi16(b0, b2);
    coeff3 = _mm_sub_epi16(b1, b3);

    _mm_store_si128((__m128i *)t_coeff, coeff0);
    _mm_store_si128((__m128i *)(t_coeff + 64), coeff1);
    _mm_store_si128((__m128i *)(t_coeff + 128), coeff2);
    _mm_store_si128((__m128i *)(t_coeff + 192), coeff3);

    t_coeff += 8;
  }
}

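// 16x16 Hadamard transform built from four 8x8 transforms plus a combining
// butterfly stage. is_final selects between tran_low_t output (via
// store_tran_low_offset_4()) and raw int16_t output, the latter being reused
// by the 32x32 transform below.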
static INLINE void hadamard_16x16_sse2(const int16_t *src_diff,
                                       ptrdiff_t src_stride, tran_low_t *coeff,
                                       int is_final) {
  // For high bit depths, it is unnecessary to store_tran_low()
  // (mult/unpack/store) and then load_tran_low() (load/pack) the same memory
  // in the next stage. Output to an intermediate buffer first, then
  // store_tran_low() in the final stage.
  DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]);
  int16_t *t_coeff = temp_coeff;
  int16_t *coeff16 = (int16_t *)coeff;
  int idx;
  for (idx = 0; idx < 4; ++idx) {
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
    hadamard_8x8_sse2(src_ptr, src_stride, (tran_low_t *)(t_coeff + idx * 64),
                      0);
  }

  for (idx = 0; idx < 64; idx += 8) {
    __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff);
    __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 64));
    __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 128));
    __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 192));

    __m128i b0 = _mm_add_epi16(coeff0, coeff1);
    __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
    __m128i b2 = _mm_add_epi16(coeff2, coeff3);
    __m128i b3 = _mm_sub_epi16(coeff2, coeff3);

    b0 = _mm_srai_epi16(b0, 1);
    b1 = _mm_srai_epi16(b1, 1);
    b2 = _mm_srai_epi16(b2, 1);
    b3 = _mm_srai_epi16(b3, 1);

    coeff0 = _mm_add_epi16(b0, b2);
    coeff1 = _mm_add_epi16(b1, b3);
    coeff2 = _mm_sub_epi16(b0, b2);
    coeff3 = _mm_sub_epi16(b1, b3);

    if (is_final) {
      store_tran_low_offset_4(coeff0, coeff);
      store_tran_low_offset_4(coeff1, coeff + 64);
      store_tran_low_offset_4(coeff2, coeff + 128);
      store_tran_low_offset_4(coeff3, coeff + 192);
      coeff += 4;
    } else {
      _mm_store_si128((__m128i *)coeff16, coeff0);
      _mm_store_si128((__m128i *)(coeff16 + 64), coeff1);
      _mm_store_si128((__m128i *)(coeff16 + 128), coeff2);
      _mm_store_si128((__m128i *)(coeff16 + 192), coeff3);
      coeff16 += 8;
    }

    t_coeff += 8;
    // Advance coeff by an additional 0 or 8 on alternate iterations (rather
    // than a fixed 8) to stay consistent with the output layout produced by
    // store_tran_low_offset_4().
    coeff += (((idx >> 3) & 1) << 3);
  }
}

void aom_hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
                             tran_low_t *coeff) {
  hadamard_16x16_sse2(src_diff, src_stride, coeff, 1);
}

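// 32x32 Hadamard transform built from four 16x16 transforms. The combining
// stage widens to 32 bits before the >> 2 so the intermediate sums cannot
// overflow, then packs back to 16 bits for storage.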
void aom_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
                             tran_low_t *coeff) {
  // For high bit depths, it is unnecessary to store_tran_low()
  // (mult/unpack/store) and then load_tran_low() (load/pack) the same memory
  // in the next stage. Output to an intermediate buffer first, then
  // store_tran_low() in the final stage.
  DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]);
  int16_t *t_coeff = temp_coeff;
  int idx;
  __m128i coeff0_lo, coeff1_lo, coeff2_lo, coeff3_lo, b0_lo, b1_lo, b2_lo,
      b3_lo;
  __m128i coeff0_hi, coeff1_hi, coeff2_hi, coeff3_hi, b0_hi, b1_hi, b2_hi,
      b3_hi;
  __m128i b0, b1, b2, b3;
  const __m128i zero = _mm_setzero_si128();
  for (idx = 0; idx < 4; ++idx) {
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
    hadamard_16x16_sse2(src_ptr, src_stride,
                        (tran_low_t *)(t_coeff + idx * 256), 0);
  }

  for (idx = 0; idx < 256; idx += 8) {
    __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff);
    __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 256));
    __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 512));
    __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 768));

    // Sign extend 16 bit to 32 bit.
    sign_extend_16bit_to_32bit_sse2(coeff0, zero, &coeff0_lo, &coeff0_hi);
    sign_extend_16bit_to_32bit_sse2(coeff1, zero, &coeff1_lo, &coeff1_hi);
    sign_extend_16bit_to_32bit_sse2(coeff2, zero, &coeff2_lo, &coeff2_hi);
    sign_extend_16bit_to_32bit_sse2(coeff3, zero, &coeff3_lo, &coeff3_hi);

    b0_lo = _mm_add_epi32(coeff0_lo, coeff1_lo);
    b0_hi = _mm_add_epi32(coeff0_hi, coeff1_hi);

    b1_lo = _mm_sub_epi32(coeff0_lo, coeff1_lo);
    b1_hi = _mm_sub_epi32(coeff0_hi, coeff1_hi);

    b2_lo = _mm_add_epi32(coeff2_lo, coeff3_lo);
    b2_hi = _mm_add_epi32(coeff2_hi, coeff3_hi);

    b3_lo = _mm_sub_epi32(coeff2_lo, coeff3_lo);
    b3_hi = _mm_sub_epi32(coeff2_hi, coeff3_hi);

    b0_lo = _mm_srai_epi32(b0_lo, 2);
    b1_lo = _mm_srai_epi32(b1_lo, 2);
    b2_lo = _mm_srai_epi32(b2_lo, 2);
    b3_lo = _mm_srai_epi32(b3_lo, 2);

    b0_hi = _mm_srai_epi32(b0_hi, 2);
    b1_hi = _mm_srai_epi32(b1_hi, 2);
    b2_hi = _mm_srai_epi32(b2_hi, 2);
    b3_hi = _mm_srai_epi32(b3_hi, 2);

    b0 = _mm_packs_epi32(b0_lo, b0_hi);
    b1 = _mm_packs_epi32(b1_lo, b1_hi);
    b2 = _mm_packs_epi32(b2_lo, b2_hi);
    b3 = _mm_packs_epi32(b3_lo, b3_hi);

    coeff0 = _mm_add_epi16(b0, b2);
    coeff1 = _mm_add_epi16(b1, b3);
    store_tran_low_offset_4(coeff0, coeff);
    store_tran_low_offset_4(coeff1, coeff + 256);

    coeff2 = _mm_sub_epi16(b0, b2);
    coeff3 = _mm_sub_epi16(b1, b3);
    store_tran_low_offset_4(coeff2, coeff + 512);
    store_tran_low_offset_4(coeff3, coeff + 768);

    // Advance coeff by 4 and 12 on alternate iterations (rather than a fixed
    // 8) to stay consistent with the output layout produced by
    // store_tran_low_offset_4().
    coeff += (4 + (((idx >> 3) & 1) << 3));
    t_coeff += 8;
  }
}

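// Returns the sum of absolute values of the |length| transform coefficients at
// |coeff|, equivalent to the scalar loop
//   for (i = 0; i < length; ++i) satd += abs(coeff[i]);
// Assumes |length| is a multiple of 4 and |coeff| is 16-byte aligned.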
int aom_satd_sse2(const tran_low_t *coeff, int length) {
  int i;
  const __m128i zero = _mm_setzero_si128();
  __m128i accum = zero;

  for (i = 0; i < length; i += 4) {
    const __m128i src_line = _mm_load_si128((const __m128i *)coeff);
    const __m128i coeff_sign = _mm_srai_epi32(src_line, 31);
    const __m128i abs_coeff = invert_sign_32_sse2(src_line, coeff_sign);
    accum = _mm_add_epi32(accum, abs_coeff);
    coeff += 4;
  }

  {  // cascading summation of accum
    __m128i hi = _mm_srli_si128(accum, 8);
    accum = _mm_add_epi32(accum, hi);
    hi = _mm_srli_epi64(accum, 32);
    accum = _mm_add_epi32(accum, hi);
  }

  return _mm_cvtsi128_si32(accum);
}

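// Low-precision variant of aom_satd_sse2() operating on 16-bit coefficients.
// Assumes |length| is a multiple of 16.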
int aom_satd_lp_sse2(const int16_t *coeff, int length) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i one = _mm_set1_epi16(1);
  __m128i accum = zero;

  for (int i = 0; i < length; i += 16) {
    const __m128i src_line0 = _mm_loadu_si128((const __m128i *)coeff);
    const __m128i src_line1 = _mm_loadu_si128((const __m128i *)(coeff + 8));
    const __m128i inv0 = _mm_sub_epi16(zero, src_line0);
    const __m128i inv1 = _mm_sub_epi16(zero, src_line1);
    const __m128i abs0 = _mm_max_epi16(src_line0, inv0);  // abs(src_line)
    const __m128i abs1 = _mm_max_epi16(src_line1, inv1);  // abs(src_line)
    const __m128i sum0 = _mm_madd_epi16(abs0, one);
    const __m128i sum1 = _mm_madd_epi16(abs1, one);
    accum = _mm_add_epi32(accum, sum0);
    accum = _mm_add_epi32(accum, sum1);
    coeff += 16;
  }

  {  // cascading summation of accum
    __m128i hi = _mm_srli_si128(accum, 8);
    accum = _mm_add_epi32(accum, hi);
    hi = _mm_srli_epi64(accum, 32);
    accum = _mm_add_epi32(accum, hi);
  }

  return _mm_cvtsi128_si32(accum);
}

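// Projects the |width| x |height| block at |ref| onto a single row: hbuf[x]
// receives the sum of column x over all rows, scaled down by an arithmetic
// shift of norm_factor.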
void aom_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref,
                          const int ref_stride, const int width,
                          const int height, int norm_factor) {
  // The SIMD implementation assumes width is a multiple of 16 and height is a
  // multiple of 2. SIMD support for other widths and heights would need to be
  // added.
  assert(width % 16 == 0 && height % 2 == 0);
  __m128i zero = _mm_setzero_si128();

  for (int wd = 0; wd < width; wd += 16) {
    const uint8_t *ref_tmp = ref + wd;
    int16_t *hbuf_tmp = hbuf + wd;
    __m128i s0 = zero;
    __m128i s1 = zero;
    int idx = 0;
    do {
      __m128i src_line = _mm_loadu_si128((const __m128i *)ref_tmp);
      __m128i t0 = _mm_unpacklo_epi8(src_line, zero);
      __m128i t1 = _mm_unpackhi_epi8(src_line, zero);
      s0 = _mm_add_epi16(s0, t0);
      s1 = _mm_add_epi16(s1, t1);
      ref_tmp += ref_stride;

      src_line = _mm_loadu_si128((const __m128i *)ref_tmp);
      t0 = _mm_unpacklo_epi8(src_line, zero);
      t1 = _mm_unpackhi_epi8(src_line, zero);
      s0 = _mm_add_epi16(s0, t0);
      s1 = _mm_add_epi16(s1, t1);
      ref_tmp += ref_stride;
      idx += 2;
    } while (idx < height);

    s0 = _mm_srai_epi16(s0, norm_factor);
    s1 = _mm_srai_epi16(s1, norm_factor);
    _mm_storeu_si128((__m128i *)(hbuf_tmp), s0);
    _mm_storeu_si128((__m128i *)(hbuf_tmp + 8), s1);
  }
}

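// Projects the |width| x |height| block at |ref| onto a single column: vbuf[y]
// receives the sum of row y, scaled down by a shift of norm_factor.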
void aom_int_pro_col_sse2(int16_t *vbuf, const uint8_t *ref,
                          const int ref_stride, const int width,
                          const int height, int norm_factor) {
  // The SIMD implementation assumes width is a multiple of 16.
  assert(width % 16 == 0);

  for (int ht = 0; ht < height; ht++) {
    const uint8_t *ref_tmp = ref + (ht * ref_stride);
    __m128i zero = _mm_setzero_si128();
    __m128i s0 = zero;
    __m128i s1, src_line;
    for (int i = 0; i < width; i += 16) {
      src_line = _mm_loadu_si128((const __m128i *)ref_tmp);
      s1 = _mm_sad_epu8(src_line, zero);
      s0 = _mm_add_epi16(s0, s1);
      ref_tmp += 16;
    }

    s1 = _mm_srli_si128(s0, 8);
    s0 = _mm_add_epi16(s0, s1);
    vbuf[ht] = _mm_cvtsi128_si32(s0) >> norm_factor;
  }
}