• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "./vpx_dsp_rtcd.h"
12 #include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
13 #include "vpx_dsp/x86/inv_txfm_sse2.h"
14 #include "vpx_dsp/x86/transpose_sse2.h"
15 #include "vpx_dsp/x86/txfm_common_sse2.h"
16 
vpx_highbd_idct8x8_64_add_sse2(const tran_low_t * input,uint16_t * dest,int stride,int bd)17 void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest,
18                                     int stride, int bd) {
19   tran_low_t out[8 * 8];
20   tran_low_t *outptr = out;
21   int i, j, test;
22   __m128i inptr[8];
23   __m128i min_input, max_input, temp1, temp2, sign_bits;
24   const __m128i zero = _mm_set1_epi16(0);
25   const __m128i sixteen = _mm_set1_epi16(16);
26   const __m128i max = _mm_set1_epi16(6201);
27   const __m128i min = _mm_set1_epi16(-6201);
28   int optimised_cols = 0;
29 
30   // Load input into __m128i & pack to 16 bits
31   for (i = 0; i < 8; i++) {
32     temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
33     temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
34     inptr[i] = _mm_packs_epi32(temp1, temp2);
35   }
36 
37   // Find the min & max for the row transform
38   max_input = _mm_max_epi16(inptr[0], inptr[1]);
39   min_input = _mm_min_epi16(inptr[0], inptr[1]);
40   for (i = 2; i < 8; i++) {
41     max_input = _mm_max_epi16(max_input, inptr[i]);
42     min_input = _mm_min_epi16(min_input, inptr[i]);
43   }
44   max_input = _mm_cmpgt_epi16(max_input, max);
45   min_input = _mm_cmplt_epi16(min_input, min);
46   temp1 = _mm_or_si128(max_input, min_input);
47   test = _mm_movemask_epi8(temp1);
48 
49   if (!test) {
50     // Do the row transform
51     idct8_sse2(inptr);
52 
53     // Find the min & max for the column transform
54     max_input = _mm_max_epi16(inptr[0], inptr[1]);
55     min_input = _mm_min_epi16(inptr[0], inptr[1]);
56     for (i = 2; i < 8; i++) {
57       max_input = _mm_max_epi16(max_input, inptr[i]);
58       min_input = _mm_min_epi16(min_input, inptr[i]);
59     }
60     max_input = _mm_cmpgt_epi16(max_input, max);
61     min_input = _mm_cmplt_epi16(min_input, min);
62     temp1 = _mm_or_si128(max_input, min_input);
63     test = _mm_movemask_epi8(temp1);
64 
65     if (test) {
66       array_transpose_8x8(inptr, inptr);
67       for (i = 0; i < 8; i++) {
68         sign_bits = _mm_cmplt_epi16(inptr[i], zero);
69         temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
70         temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
71         _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
72         _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
73       }
74     } else {
75       // Set to use the optimised transform for the column
76       optimised_cols = 1;
77     }
78   } else {
79     // Run the un-optimised row transform
80     for (i = 0; i < 8; ++i) {
81       vpx_highbd_idct8_c(input, outptr, bd);
82       input += 8;
83       outptr += 8;
84     }
85   }
86 
87   if (optimised_cols) {
88     idct8_sse2(inptr);
89 
90     // Final round & shift and Reconstruction and Store
91     {
92       __m128i d[8];
93       for (i = 0; i < 8; i++) {
94         inptr[i] = _mm_add_epi16(inptr[i], sixteen);
95         d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
96         inptr[i] = _mm_srai_epi16(inptr[i], 5);
97         d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
98         // Store
99         _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
100       }
101     }
102   } else {
103     // Run the un-optimised column transform
104     tran_low_t temp_in[8], temp_out[8];
105     for (i = 0; i < 8; ++i) {
106       for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
107       vpx_highbd_idct8_c(temp_in, temp_out, bd);
108       for (j = 0; j < 8; ++j) {
109         dest[j * stride + i] = highbd_clip_pixel_add(
110             dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
111       }
112     }
113   }
114 }
115 
vpx_highbd_idct8x8_12_add_sse2(const tran_low_t * input,uint16_t * dest,int stride,int bd)116 void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest,
117                                     int stride, int bd) {
118   tran_low_t out[8 * 8] = { 0 };
119   tran_low_t *outptr = out;
120   int i, j, test;
121   __m128i inptr[8];
122   __m128i min_input, max_input, temp1, temp2, sign_bits;
123   const __m128i zero = _mm_set1_epi16(0);
124   const __m128i sixteen = _mm_set1_epi16(16);
125   const __m128i max = _mm_set1_epi16(6201);
126   const __m128i min = _mm_set1_epi16(-6201);
127   int optimised_cols = 0;
128 
129   // Load input into __m128i & pack to 16 bits
130   for (i = 0; i < 8; i++) {
131     temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
132     temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
133     inptr[i] = _mm_packs_epi32(temp1, temp2);
134   }
135 
136   // Find the min & max for the row transform
137   // only first 4 row has non-zero coefs
138   max_input = _mm_max_epi16(inptr[0], inptr[1]);
139   min_input = _mm_min_epi16(inptr[0], inptr[1]);
140   for (i = 2; i < 4; i++) {
141     max_input = _mm_max_epi16(max_input, inptr[i]);
142     min_input = _mm_min_epi16(min_input, inptr[i]);
143   }
144   max_input = _mm_cmpgt_epi16(max_input, max);
145   min_input = _mm_cmplt_epi16(min_input, min);
146   temp1 = _mm_or_si128(max_input, min_input);
147   test = _mm_movemask_epi8(temp1);
148 
149   if (!test) {
150     // Do the row transform
151     idct8_sse2(inptr);
152 
153     // Find the min & max for the column transform
154     // N.B. Only first 4 cols contain non-zero coeffs
155     max_input = _mm_max_epi16(inptr[0], inptr[1]);
156     min_input = _mm_min_epi16(inptr[0], inptr[1]);
157     for (i = 2; i < 8; i++) {
158       max_input = _mm_max_epi16(max_input, inptr[i]);
159       min_input = _mm_min_epi16(min_input, inptr[i]);
160     }
161     max_input = _mm_cmpgt_epi16(max_input, max);
162     min_input = _mm_cmplt_epi16(min_input, min);
163     temp1 = _mm_or_si128(max_input, min_input);
164     test = _mm_movemask_epi8(temp1);
165 
166     if (test) {
167       // Use fact only first 4 rows contain non-zero coeffs
168       array_transpose_4X8(inptr, inptr);
169       for (i = 0; i < 4; i++) {
170         sign_bits = _mm_cmplt_epi16(inptr[i], zero);
171         temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
172         temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
173         _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
174         _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
175       }
176     } else {
177       // Set to use the optimised transform for the column
178       optimised_cols = 1;
179     }
180   } else {
181     // Run the un-optimised row transform
182     for (i = 0; i < 4; ++i) {
183       vpx_highbd_idct8_c(input, outptr, bd);
184       input += 8;
185       outptr += 8;
186     }
187   }
188 
189   if (optimised_cols) {
190     idct8_sse2(inptr);
191 
192     // Final round & shift and Reconstruction and Store
193     {
194       __m128i d[8];
195       for (i = 0; i < 8; i++) {
196         inptr[i] = _mm_add_epi16(inptr[i], sixteen);
197         d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
198         inptr[i] = _mm_srai_epi16(inptr[i], 5);
199         d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
200         // Store
201         _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
202       }
203     }
204   } else {
205     // Run the un-optimised column transform
206     tran_low_t temp_in[8], temp_out[8];
207     for (i = 0; i < 8; ++i) {
208       for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
209       vpx_highbd_idct8_c(temp_in, temp_out, bd);
210       for (j = 0; j < 8; ++j) {
211         dest[j * stride + i] = highbd_clip_pixel_add(
212             dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
213       }
214     }
215   }
216 }
217 
// Single-coefficient variant of the 8x8 high-bitdepth inverse transform.
//
// All of the work is delegated to highbd_idct_1_add_kernel(); input, dest,
// stride and bd are forwarded untouched.
// NOTE(review): the trailing 8 is presumably the block dimension expected by
// the shared kernel -- confirm against its definition.
void vpx_highbd_idct8x8_1_add_sse2(const tran_low_t *input, uint16_t *dest,
                                   int stride, int bd) {
  highbd_idct_1_add_kernel(input, dest, stride, bd, 8);
}
222