1 /*
2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "./vpx_dsp_rtcd.h"
12 #include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
13 #include "vpx_dsp/x86/inv_txfm_sse2.h"
14 #include "vpx_dsp/x86/transpose_sse2.h"
15 #include "vpx_dsp/x86/txfm_common_sse2.h"
16
vpx_highbd_idct8x8_64_add_sse2(const tran_low_t * input,uint16_t * dest,int stride,int bd)17 void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest,
18 int stride, int bd) {
19 tran_low_t out[8 * 8];
20 tran_low_t *outptr = out;
21 int i, j, test;
22 __m128i inptr[8];
23 __m128i min_input, max_input, temp1, temp2, sign_bits;
24 const __m128i zero = _mm_set1_epi16(0);
25 const __m128i sixteen = _mm_set1_epi16(16);
26 const __m128i max = _mm_set1_epi16(6201);
27 const __m128i min = _mm_set1_epi16(-6201);
28 int optimised_cols = 0;
29
30 // Load input into __m128i & pack to 16 bits
31 for (i = 0; i < 8; i++) {
32 temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
33 temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
34 inptr[i] = _mm_packs_epi32(temp1, temp2);
35 }
36
37 // Find the min & max for the row transform
38 max_input = _mm_max_epi16(inptr[0], inptr[1]);
39 min_input = _mm_min_epi16(inptr[0], inptr[1]);
40 for (i = 2; i < 8; i++) {
41 max_input = _mm_max_epi16(max_input, inptr[i]);
42 min_input = _mm_min_epi16(min_input, inptr[i]);
43 }
44 max_input = _mm_cmpgt_epi16(max_input, max);
45 min_input = _mm_cmplt_epi16(min_input, min);
46 temp1 = _mm_or_si128(max_input, min_input);
47 test = _mm_movemask_epi8(temp1);
48
49 if (!test) {
50 // Do the row transform
51 idct8_sse2(inptr);
52
53 // Find the min & max for the column transform
54 max_input = _mm_max_epi16(inptr[0], inptr[1]);
55 min_input = _mm_min_epi16(inptr[0], inptr[1]);
56 for (i = 2; i < 8; i++) {
57 max_input = _mm_max_epi16(max_input, inptr[i]);
58 min_input = _mm_min_epi16(min_input, inptr[i]);
59 }
60 max_input = _mm_cmpgt_epi16(max_input, max);
61 min_input = _mm_cmplt_epi16(min_input, min);
62 temp1 = _mm_or_si128(max_input, min_input);
63 test = _mm_movemask_epi8(temp1);
64
65 if (test) {
66 array_transpose_8x8(inptr, inptr);
67 for (i = 0; i < 8; i++) {
68 sign_bits = _mm_cmplt_epi16(inptr[i], zero);
69 temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
70 temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
71 _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
72 _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
73 }
74 } else {
75 // Set to use the optimised transform for the column
76 optimised_cols = 1;
77 }
78 } else {
79 // Run the un-optimised row transform
80 for (i = 0; i < 8; ++i) {
81 vpx_highbd_idct8_c(input, outptr, bd);
82 input += 8;
83 outptr += 8;
84 }
85 }
86
87 if (optimised_cols) {
88 idct8_sse2(inptr);
89
90 // Final round & shift and Reconstruction and Store
91 {
92 __m128i d[8];
93 for (i = 0; i < 8; i++) {
94 inptr[i] = _mm_add_epi16(inptr[i], sixteen);
95 d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
96 inptr[i] = _mm_srai_epi16(inptr[i], 5);
97 d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
98 // Store
99 _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
100 }
101 }
102 } else {
103 // Run the un-optimised column transform
104 tran_low_t temp_in[8], temp_out[8];
105 for (i = 0; i < 8; ++i) {
106 for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
107 vpx_highbd_idct8_c(temp_in, temp_out, bd);
108 for (j = 0; j < 8; ++j) {
109 dest[j * stride + i] = highbd_clip_pixel_add(
110 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
111 }
112 }
113 }
114 }
115
vpx_highbd_idct8x8_12_add_sse2(const tran_low_t * input,uint16_t * dest,int stride,int bd)116 void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest,
117 int stride, int bd) {
118 tran_low_t out[8 * 8] = { 0 };
119 tran_low_t *outptr = out;
120 int i, j, test;
121 __m128i inptr[8];
122 __m128i min_input, max_input, temp1, temp2, sign_bits;
123 const __m128i zero = _mm_set1_epi16(0);
124 const __m128i sixteen = _mm_set1_epi16(16);
125 const __m128i max = _mm_set1_epi16(6201);
126 const __m128i min = _mm_set1_epi16(-6201);
127 int optimised_cols = 0;
128
129 // Load input into __m128i & pack to 16 bits
130 for (i = 0; i < 8; i++) {
131 temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
132 temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
133 inptr[i] = _mm_packs_epi32(temp1, temp2);
134 }
135
136 // Find the min & max for the row transform
137 // only first 4 row has non-zero coefs
138 max_input = _mm_max_epi16(inptr[0], inptr[1]);
139 min_input = _mm_min_epi16(inptr[0], inptr[1]);
140 for (i = 2; i < 4; i++) {
141 max_input = _mm_max_epi16(max_input, inptr[i]);
142 min_input = _mm_min_epi16(min_input, inptr[i]);
143 }
144 max_input = _mm_cmpgt_epi16(max_input, max);
145 min_input = _mm_cmplt_epi16(min_input, min);
146 temp1 = _mm_or_si128(max_input, min_input);
147 test = _mm_movemask_epi8(temp1);
148
149 if (!test) {
150 // Do the row transform
151 idct8_sse2(inptr);
152
153 // Find the min & max for the column transform
154 // N.B. Only first 4 cols contain non-zero coeffs
155 max_input = _mm_max_epi16(inptr[0], inptr[1]);
156 min_input = _mm_min_epi16(inptr[0], inptr[1]);
157 for (i = 2; i < 8; i++) {
158 max_input = _mm_max_epi16(max_input, inptr[i]);
159 min_input = _mm_min_epi16(min_input, inptr[i]);
160 }
161 max_input = _mm_cmpgt_epi16(max_input, max);
162 min_input = _mm_cmplt_epi16(min_input, min);
163 temp1 = _mm_or_si128(max_input, min_input);
164 test = _mm_movemask_epi8(temp1);
165
166 if (test) {
167 // Use fact only first 4 rows contain non-zero coeffs
168 array_transpose_4X8(inptr, inptr);
169 for (i = 0; i < 4; i++) {
170 sign_bits = _mm_cmplt_epi16(inptr[i], zero);
171 temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
172 temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
173 _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
174 _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
175 }
176 } else {
177 // Set to use the optimised transform for the column
178 optimised_cols = 1;
179 }
180 } else {
181 // Run the un-optimised row transform
182 for (i = 0; i < 4; ++i) {
183 vpx_highbd_idct8_c(input, outptr, bd);
184 input += 8;
185 outptr += 8;
186 }
187 }
188
189 if (optimised_cols) {
190 idct8_sse2(inptr);
191
192 // Final round & shift and Reconstruction and Store
193 {
194 __m128i d[8];
195 for (i = 0; i < 8; i++) {
196 inptr[i] = _mm_add_epi16(inptr[i], sixteen);
197 d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
198 inptr[i] = _mm_srai_epi16(inptr[i], 5);
199 d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
200 // Store
201 _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
202 }
203 }
204 } else {
205 // Run the un-optimised column transform
206 tran_low_t temp_in[8], temp_out[8];
207 for (i = 0; i < 8; ++i) {
208 for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
209 vpx_highbd_idct8_c(temp_in, temp_out, bd);
210 for (j = 0; j < 8; ++j) {
211 dest[j * stride + i] = highbd_clip_pixel_add(
212 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
213 }
214 }
215 }
216 }
217
// Single-coefficient variant of the 8x8 inverse transform + add: forwards
// all arguments to highbd_idct_1_add_kernel() with a block size of 8.
// NOTE(review): presumably this handles the DC-only case; confirm against the
// kernel's definition.
void vpx_highbd_idct8x8_1_add_sse2(const tran_low_t *input, uint16_t *dest,
                                   int stride, int bd) {
  highbd_idct_1_add_kernel(input, dest, stride, bd, 8);
}
222