/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <emmintrin.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
#include "vpx_ports/mem.h"

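// Compute the minimum and maximum absolute difference between the 8x8
// block at s (stride p) and the 8x8 block at d (stride dp). Each row is
// widened to 16 bits, differenced, and made absolute via max(x, -x);
// per-lane min/max values are folded across rows and then reduced
// horizontally into *min and *max.
// Usage sketch (src/pred and their strides are hypothetical names):
//   int mn, mx;
//   vpx_minmax_8x8_sse2(src, src_stride, pred, pred_stride, &mn, &mx);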
void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
                         int *min, int *max) {
  __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff;
  u0 = _mm_setzero_si128();
  // Row 0
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff0 = _mm_max_epi16(diff, negdiff);
  // Row 1
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(absdiff0, absdiff);
  minabsdiff = _mm_min_epi16(absdiff0, absdiff);
  // Row 2
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 2 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 3
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 3 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 4
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 4 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 5
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 5 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 6
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 6 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 7
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 7 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);

  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8));
  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32));
  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16));
  *max = _mm_extract_epi16(maxabsdiff, 0);

  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8));
  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32));
  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16));
  *min = _mm_extract_epi16(minabsdiff, 0);
}

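// Return the rounded average of the 8x8 block at s (stride p):
// (sum of the 64 pixels + 32) >> 6.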
unsigned int vpx_avg_8x8_sse2(const uint8_t *s, int p) {
  __m128i s0, s1, u0;
  unsigned int avg = 0;
  u0 = _mm_setzero_si128();
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);

  s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8));
  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 32));
  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
  avg = _mm_extract_epi16(s0, 0);
  return (avg + 32) >> 6;
}

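// Return the rounded average of the 4x4 block at s (stride p):
// (sum of the 16 pixels + 8) >> 4. Each 8-byte load brings in 8 pixels,
// but only the first four columns survive the final reduction into lane 0.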
unsigned int vpx_avg_4x4_sse2(const uint8_t *s, int p) {
  __m128i s0, s1, u0;
  unsigned int avg = 0;
  u0 = _mm_setzero_si128();
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);

  s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4));
  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
  avg = _mm_extract_epi16(s0, 0);
  return (avg + 8) >> 4;
}

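// One pass of an 8-point Hadamard transform over the eight rows held in
// in[]. Three butterfly stages combine the rows; when iter == 0 the result
// is also transposed via the unpack sequences so that the second pass
// (iter == 1) operates on the columns of the original block.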
static void hadamard_col8_sse2(__m128i *in, int iter) {
  __m128i a0 = in[0];
  __m128i a1 = in[1];
  __m128i a2 = in[2];
  __m128i a3 = in[3];
  __m128i a4 = in[4];
  __m128i a5 = in[5];
  __m128i a6 = in[6];
  __m128i a7 = in[7];

  __m128i b0 = _mm_add_epi16(a0, a1);
  __m128i b1 = _mm_sub_epi16(a0, a1);
  __m128i b2 = _mm_add_epi16(a2, a3);
  __m128i b3 = _mm_sub_epi16(a2, a3);
  __m128i b4 = _mm_add_epi16(a4, a5);
  __m128i b5 = _mm_sub_epi16(a4, a5);
  __m128i b6 = _mm_add_epi16(a6, a7);
  __m128i b7 = _mm_sub_epi16(a6, a7);

  a0 = _mm_add_epi16(b0, b2);
  a1 = _mm_add_epi16(b1, b3);
  a2 = _mm_sub_epi16(b0, b2);
  a3 = _mm_sub_epi16(b1, b3);
  a4 = _mm_add_epi16(b4, b6);
  a5 = _mm_add_epi16(b5, b7);
  a6 = _mm_sub_epi16(b4, b6);
  a7 = _mm_sub_epi16(b5, b7);

  if (iter == 0) {
    b0 = _mm_add_epi16(a0, a4);
    b7 = _mm_add_epi16(a1, a5);
    b3 = _mm_add_epi16(a2, a6);
    b4 = _mm_add_epi16(a3, a7);
    b2 = _mm_sub_epi16(a0, a4);
    b6 = _mm_sub_epi16(a1, a5);
    b1 = _mm_sub_epi16(a2, a6);
    b5 = _mm_sub_epi16(a3, a7);

    a0 = _mm_unpacklo_epi16(b0, b1);
    a1 = _mm_unpacklo_epi16(b2, b3);
    a2 = _mm_unpackhi_epi16(b0, b1);
    a3 = _mm_unpackhi_epi16(b2, b3);
    a4 = _mm_unpacklo_epi16(b4, b5);
    a5 = _mm_unpacklo_epi16(b6, b7);
    a6 = _mm_unpackhi_epi16(b4, b5);
    a7 = _mm_unpackhi_epi16(b6, b7);

    b0 = _mm_unpacklo_epi32(a0, a1);
    b1 = _mm_unpacklo_epi32(a4, a5);
    b2 = _mm_unpackhi_epi32(a0, a1);
    b3 = _mm_unpackhi_epi32(a4, a5);
    b4 = _mm_unpacklo_epi32(a2, a3);
    b5 = _mm_unpacklo_epi32(a6, a7);
    b6 = _mm_unpackhi_epi32(a2, a3);
    b7 = _mm_unpackhi_epi32(a6, a7);

    in[0] = _mm_unpacklo_epi64(b0, b1);
    in[1] = _mm_unpackhi_epi64(b0, b1);
    in[2] = _mm_unpacklo_epi64(b2, b3);
    in[3] = _mm_unpackhi_epi64(b2, b3);
    in[4] = _mm_unpacklo_epi64(b4, b5);
    in[5] = _mm_unpackhi_epi64(b4, b5);
    in[6] = _mm_unpacklo_epi64(b6, b7);
    in[7] = _mm_unpackhi_epi64(b6, b7);
  } else {
    in[0] = _mm_add_epi16(a0, a4);
    in[7] = _mm_add_epi16(a1, a5);
    in[3] = _mm_add_epi16(a2, a6);
    in[4] = _mm_add_epi16(a3, a7);
    in[2] = _mm_sub_epi16(a0, a4);
    in[6] = _mm_sub_epi16(a1, a5);
    in[1] = _mm_sub_epi16(a2, a6);
    in[5] = _mm_sub_epi16(a3, a7);
  }
}

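// 8x8 Hadamard transform of the 16-bit residual block src_diff (rows must
// be 16-byte aligned): one column pass, one row pass, then the eight
// result rows are written out through store_tran_low().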
void vpx_hadamard_8x8_sse2(int16_t const *src_diff, ptrdiff_t src_stride,
                           tran_low_t *coeff) {
  __m128i src[8];
  src[0] = _mm_load_si128((const __m128i *)src_diff);
  src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride));

  hadamard_col8_sse2(src, 0);
  hadamard_col8_sse2(src, 1);

  store_tran_low(src[0], coeff);
  coeff += 8;
  store_tran_low(src[1], coeff);
  coeff += 8;
  store_tran_low(src[2], coeff);
  coeff += 8;
  store_tran_low(src[3], coeff);
  coeff += 8;
  store_tran_low(src[4], coeff);
  coeff += 8;
  store_tran_low(src[5], coeff);
  coeff += 8;
  store_tran_low(src[6], coeff);
  coeff += 8;
  store_tran_low(src[7], coeff);
}

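// 16x16 Hadamard: transform each of the four 8x8 quadrants independently,
// then combine corresponding coefficients from the quadrants with one more
// butterfly stage. The intermediate sums are halved (>> 1) before the
// final combine to keep the results within 16-bit range.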
void vpx_hadamard_16x16_sse2(int16_t const *src_diff, ptrdiff_t src_stride,
                             tran_low_t *coeff) {
  int idx;
  for (idx = 0; idx < 4; ++idx) {
    int16_t const *src_ptr =
        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
    vpx_hadamard_8x8_sse2(src_ptr, src_stride, coeff + idx * 64);
  }

  for (idx = 0; idx < 64; idx += 8) {
    __m128i coeff0 = load_tran_low(coeff);
    __m128i coeff1 = load_tran_low(coeff + 64);
    __m128i coeff2 = load_tran_low(coeff + 128);
    __m128i coeff3 = load_tran_low(coeff + 192);

    __m128i b0 = _mm_add_epi16(coeff0, coeff1);
    __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
    __m128i b2 = _mm_add_epi16(coeff2, coeff3);
    __m128i b3 = _mm_sub_epi16(coeff2, coeff3);

    b0 = _mm_srai_epi16(b0, 1);
    b1 = _mm_srai_epi16(b1, 1);
    b2 = _mm_srai_epi16(b2, 1);
    b3 = _mm_srai_epi16(b3, 1);

    coeff0 = _mm_add_epi16(b0, b2);
    coeff1 = _mm_add_epi16(b1, b3);
    store_tran_low(coeff0, coeff);
    store_tran_low(coeff1, coeff + 64);

    coeff2 = _mm_sub_epi16(b0, b2);
    coeff3 = _mm_sub_epi16(b1, b3);
    store_tran_low(coeff2, coeff + 128);
    store_tran_low(coeff3, coeff + 192);

    coeff += 8;
  }
}

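// Sum of absolute values of the transform coefficients. length must be a
// multiple of 8. Absolute values are widened to 32 bits before
// accumulation so the running total does not overflow.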
int vpx_satd_sse2(const tran_low_t *coeff, int length) {
  int i;
  const __m128i zero = _mm_setzero_si128();
  __m128i accum = zero;

  for (i = 0; i < length; i += 8) {
    const __m128i src_line = load_tran_low(coeff);
    const __m128i inv = _mm_sub_epi16(zero, src_line);
    const __m128i abs = _mm_max_epi16(src_line, inv);  // abs(src_line)
    const __m128i abs_lo = _mm_unpacklo_epi16(abs, zero);
    const __m128i abs_hi = _mm_unpackhi_epi16(abs, zero);
    const __m128i sum = _mm_add_epi32(abs_lo, abs_hi);
    accum = _mm_add_epi32(accum, sum);
    coeff += 8;
  }

  {  // cascading summation of accum
    __m128i hi = _mm_srli_si128(accum, 8);
    accum = _mm_add_epi32(accum, hi);
    hi = _mm_srli_epi64(accum, 32);
    accum = _mm_add_epi32(accum, hi);
  }

  return _mm_cvtsi128_si32(accum);
}

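// Integral projection of rows: sum each of 16 pixel columns over `height`
// rows of ref, then normalize with an arithmetic shift (divide by 32, 16,
// or 8 for heights of 64, 32, or 16) and store the 16 averaged column
// sums to hbuf.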
void vpx_int_pro_row_sse2(int16_t *hbuf, uint8_t const *ref,
                          const int ref_stride, const int height) {
  int idx;
  __m128i zero = _mm_setzero_si128();
  __m128i src_line = _mm_loadu_si128((const __m128i *)ref);
  __m128i s0 = _mm_unpacklo_epi8(src_line, zero);
  __m128i s1 = _mm_unpackhi_epi8(src_line, zero);
  __m128i t0, t1;
  int height_1 = height - 1;
  ref += ref_stride;

  for (idx = 1; idx < height_1; idx += 2) {
    src_line = _mm_loadu_si128((const __m128i *)ref);
    t0 = _mm_unpacklo_epi8(src_line, zero);
    t1 = _mm_unpackhi_epi8(src_line, zero);
    s0 = _mm_adds_epu16(s0, t0);
    s1 = _mm_adds_epu16(s1, t1);
    ref += ref_stride;

    src_line = _mm_loadu_si128((const __m128i *)ref);
    t0 = _mm_unpacklo_epi8(src_line, zero);
    t1 = _mm_unpackhi_epi8(src_line, zero);
    s0 = _mm_adds_epu16(s0, t0);
    s1 = _mm_adds_epu16(s1, t1);
    ref += ref_stride;
  }

  src_line = _mm_loadu_si128((const __m128i *)ref);
  t0 = _mm_unpacklo_epi8(src_line, zero);
  t1 = _mm_unpackhi_epi8(src_line, zero);
  s0 = _mm_adds_epu16(s0, t0);
  s1 = _mm_adds_epu16(s1, t1);

  if (height == 64) {
    s0 = _mm_srai_epi16(s0, 5);
    s1 = _mm_srai_epi16(s1, 5);
  } else if (height == 32) {
    s0 = _mm_srai_epi16(s0, 4);
    s1 = _mm_srai_epi16(s1, 4);
  } else {
    s0 = _mm_srai_epi16(s0, 3);
    s1 = _mm_srai_epi16(s1, 3);
  }

  _mm_storeu_si128((__m128i *)hbuf, s0);
  hbuf += 8;
  _mm_storeu_si128((__m128i *)hbuf, s1);
}

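// Integral projection of columns: sum all `width` pixels of one row of ref
// via SAD against zero, 16 pixels at a time. Assumes width is a multiple
// of 16 and ref is 16-byte aligned (required by _mm_load_si128).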
int16_t vpx_int_pro_col_sse2(uint8_t const *ref, const int width) {
  __m128i zero = _mm_setzero_si128();
  __m128i src_line = _mm_load_si128((const __m128i *)ref);
  __m128i s0 = _mm_sad_epu8(src_line, zero);
  __m128i s1;
  int i;

  for (i = 16; i < width; i += 16) {
    ref += 16;
    src_line = _mm_load_si128((const __m128i *)ref);
    s1 = _mm_sad_epu8(src_line, zero);
    s0 = _mm_adds_epu16(s0, s1);
  }

  s1 = _mm_srli_si128(s0, 8);
  s0 = _mm_adds_epu16(s0, s1);

  return _mm_extract_epi16(s0, 0);
}

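// Variance (scaled by the vector length) of the difference between two
// projection vectors of length 4 << bwl: returns sum(d * d) minus
// sum(d)^2 / length, where d = ref - src. src must be 16-byte aligned;
// ref may be unaligned.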
int vpx_vector_var_sse2(int16_t const *ref, int16_t const *src, const int bwl) {
  int idx;
  int width = 4 << bwl;
  int16_t mean;
  __m128i v0 = _mm_loadu_si128((const __m128i *)ref);
  __m128i v1 = _mm_load_si128((const __m128i *)src);
  __m128i diff = _mm_subs_epi16(v0, v1);
  __m128i sum = diff;
  __m128i sse = _mm_madd_epi16(diff, diff);

  ref += 8;
  src += 8;

  for (idx = 8; idx < width; idx += 8) {
    v0 = _mm_loadu_si128((const __m128i *)ref);
    v1 = _mm_load_si128((const __m128i *)src);
    diff = _mm_subs_epi16(v0, v1);

    sum = _mm_add_epi16(sum, diff);
    v0 = _mm_madd_epi16(diff, diff);
    sse = _mm_add_epi32(sse, v0);

    ref += 8;
    src += 8;
  }

  v0 = _mm_srli_si128(sum, 8);
  sum = _mm_add_epi16(sum, v0);
  v0 = _mm_srli_epi64(sum, 32);
  sum = _mm_add_epi16(sum, v0);
  v0 = _mm_srli_epi32(sum, 16);
  sum = _mm_add_epi16(sum, v0);

  v1 = _mm_srli_si128(sse, 8);
  sse = _mm_add_epi32(sse, v1);
  v1 = _mm_srli_epi64(sse, 32);
  sse = _mm_add_epi32(sse, v1);

  mean = _mm_extract_epi16(sum, 0);

  return _mm_cvtsi128_si32(sse) - ((mean * mean) >> (bwl + 2));
}